Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix: test untested path in ak.cartesian & broadcasting #2329

Merged
merged 13 commits into from
Mar 21, 2023
5 changes: 3 additions & 2 deletions src/awkward/_broadcasting.py
Original file line number Diff line number Diff line change
Expand Up @@ -160,7 +160,7 @@ def all_same_offsets(backend: ak._backends.Backend, inputs: list) -> bool:
my_offsets = index_nplike.empty(0, dtype=np.int64)
else:
my_offsets = index_nplike.arange(
0, x.content.length, x.size, dtype=np.int64
0, x.content.length + 1, x.size, dtype=np.int64
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We want a range that includes the end point (`x.content.length`); by default, `arange` excludes its stop value, so the last offset would be missing.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We do? Oh, yes—we do! Offsets always have to be one longer than the array itself; that's right. Good catch!

)

if offsets is None:
Expand Down Expand Up @@ -841,7 +841,8 @@ def continuation():
else:
lencontent = backend.index_nplike.max(stops)
nextinputs.append(x.content[:lencontent])

elif isinstance(x, RegularArray):
nextinputs.append(x.content[: x.size * x.length])
Comment on lines +844 to +845
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We test for RegularArray in all_same_offsets, but here we weren't actually pulling out the content.

else:
nextinputs.append(x)

Expand Down
280 changes: 139 additions & 141 deletions src/awkward/operations/ak_cartesian.py
Original file line number Diff line number Diff line change
Expand Up @@ -177,10 +177,7 @@ def cartesian(
[(4, 3.3, 'a'), (4, 3.3, 'b')]]

The order of the output is fixed: it is always lexicographical in the
order that the `arrays` are written. (Before Python 3.6, the order of
keys in a dict were not guaranteed, so the dict interface is not
recommended for these versions of Python.) Thus, it is not possible to
group by `three` in the example above.
order that the `arrays` are written.

To emulate an SQL or Pandas "group by" operation, put the keys that you
wish to group by *first* and use `nested=[0]` or `nested=[n]` to group by
Expand Down Expand Up @@ -212,23 +209,23 @@ def _impl(arrays, axis, nested, parameters, with_name, highlevel, behavior):
if isinstance(arrays, dict):
backend = ak._backends.backend_of(*arrays.values(), default=cpu)
behavior = behavior_of(*arrays.values(), behavior=behavior)
new_arrays = {}
for n, x in arrays.items():
new_arrays[n] = ak.operations.to_layout(
x, allow_record=False, allow_other=False
array_layouts = {
name: ak.operations.to_layout(
layout, allow_record=False, allow_other=False
).to_backend(backend)
for name, layout in arrays.items()
}

else:
arrays = list(arrays)
backend = ak._backends.backend_of(*arrays, default=cpu)
behavior = behavior_of(*arrays, behavior=behavior)
new_arrays = []
for x in arrays:
new_arrays.append(
ak.operations.to_layout(
x, allow_record=False, allow_other=False
).to_backend(backend)
)
array_layouts = [
ak.operations.to_layout(
layout, allow_record=False, allow_other=False
).to_backend(backend)
for layout in arrays
]

if with_name is not None:
if parameters is None:
Expand All @@ -237,65 +234,72 @@ def _impl(arrays, axis, nested, parameters, with_name, highlevel, behavior):
parameters = dict(parameters)
parameters["__record__"] = with_name

if isinstance(new_arrays, dict):
new_arrays_values = list(new_arrays.values())
if isinstance(array_layouts, dict):
layouts = list(array_layouts.values())
else:
new_arrays_values = new_arrays
layouts = array_layouts

posaxis = maybe_posaxis(layouts[0], axis, 1)

posaxis = maybe_posaxis(new_arrays_values[0], axis, 1)
# Validate `posaxis`
if posaxis is None or posaxis < 0:
raise ak._errors.wrap_error(ValueError("negative axis depth is ambiguous"))
for x in new_arrays_values[1:]:
if maybe_posaxis(x, axis, 1) != posaxis:
for layout in layouts[1:]:
if maybe_posaxis(layout, axis, 1) != posaxis:
raise ak._errors.wrap_error(
ValueError(
"arrays to cartesian-product do not have the same depth for negative axis"
)
)

if posaxis == 0:
if nested is None or nested is False:
nested = []

if isinstance(new_arrays, dict):
if nested is True:
nested = list(new_arrays.keys()) # last key is ignored below
if any(not (isinstance(n, str) and n in new_arrays) for x in nested):
# Validate `nested`
if nested is None or nested is False:
nested = []
elif nested is True:
if isinstance(array_layouts, dict):
nested = list(array_layouts.keys())[:-1]
else:
nested = list(range(len(array_layouts))[:-1])
else:
if isinstance(array_layouts, dict):
if any(not (isinstance(x, str) and x in array_layouts) for x in nested):
raise ak._errors.wrap_error(
ValueError(
"the 'nested' parameter of cartesian must be dict keys "
"for a dict of arrays"
)
)
fields = []
layouts = []
tonested = []
for i, (n, x) in enumerate(new_arrays.items()):
fields.append(n)
layouts.append(x)
if n in nested:
tonested.append(i)
nested = tonested

if len(nested) >= len(array_layouts):
raise ak._errors.wrap_error(
ValueError(
"the `nested` parameter of cartesian must contain "
"fewer items than there are arrays"
)
)
else:
if nested is True:
nested = list(range(len(new_arrays) - 1))
if any(
not (isinstance(x, int) and 0 <= x < len(new_arrays) - 1)
not (isinstance(x, int) and 0 <= x < len(array_layouts) - 1)
for x in nested
):
raise ak._errors.wrap_error(
ValueError(
"the 'nested' prarmeter of cartesian must be integers in "
"the 'nested' parameter of cartesian must be integers in "
"[0, len(arrays) - 1) for an iterable of arrays"
)
)
fields = None
layouts = []
for x in new_arrays:
layouts.append(x)

layouts = list(layouts)
if posaxis == 0:
if isinstance(array_layouts, dict):
fields = []
tonested = []
for i, (name, _) in enumerate(array_layouts.items()):
fields.append(name)
if name in nested:
tonested.append(i)
nested = tonested

else:
fields = None

indexes = [
ak.index.Index64(backend.index_nplike.reshape(x, (-1,)))
Expand All @@ -305,126 +309,120 @@ def _impl(arrays, axis, nested, parameters, with_name, highlevel, behavior):
)
]
outs = [
ak.contents.IndexedArray.simplified(x, y)
for x, y in __builtins__["zip"](indexes, layouts)
ak.contents.IndexedArray.simplified(x, y) for x, y in zip(indexes, layouts)
]

result = ak.contents.RecordArray(outs, fields, parameters=parameters)
for i in range(len(new_arrays) - 1, -1, -1):
for i in range(len(array_layouts))[::-1]:
if i in nested:
result = ak.contents.RegularArray(result, len(layouts[i + 1]), 0)
result = ak.contents.RegularArray(result, layouts[i + 1].length, 0)

else:

def newaxis(layout, i):
if i == 0:
def add_outer_dimensions(
layout: ak.contents.Content, n: int
) -> ak.contents.Content:
if n == 0:
return layout
else:
return ak.contents.RegularArray(newaxis(layout, i - 1), 1, 0)

def getgetfunction1(i, **kwargs):
def getfunction1(layout, depth, **kwargs):
if depth == 2:
return newaxis(layout, i)
else:
return None

return getfunction1

def getgetfunction2(i, **kwargs):
def getfunction2(layout, depth, **kwargs):
if depth == posaxis:
inside = len(new_arrays) - i - 1
outside = i
if (
layout.parameter("__array__") == "string"
or layout.parameter("__array__") == "bytestring"
):
raise ak._errors.wrap_error(
ValueError(
"ak.cartesian does not compute combinations of the "
"characters of a string; please split it into lists"
)
)
nextlayout = ak._do.recursively_apply(
layout, getgetfunction1(inside), behavior
)
return newaxis(nextlayout, outside)
else:
return None

return getfunction2

def apply(x, i):
layout = ak.operations.to_layout(x, allow_record=False, allow_other=False)
return ak._do.recursively_apply(layout, getgetfunction2(i), behavior)
return ak.contents.RegularArray(
add_outer_dimensions(layout, n - 1), 1, 0
)

toflatten = []
if nested is None or nested is False:
nested = []
def apply_pad_inner_list(layout, depth, lateral_context, **kwargs):
"""
Add new dimensions (given by lateral_context["n"]) above innermost list
"""
n = lateral_context["n"]
# We want to be above at least one dimension (list)
if depth == 2:
return add_outer_dimensions(layout, n)
else:
return None

if isinstance(new_arrays, dict):
if nested is True:
nested = list(new_arrays.keys()) # last key is ignored below
if any(not (isinstance(n, str) and n in new_arrays) for x in nested):
raise ak._errors.wrap_error(
ValueError(
"the 'nested' parameter of cartesian must be dict keys "
"for a dict of arrays"
def apply_pad_inner_list_at_axis(layout, depth, lateral_context, **kwargs):
"""
Each array in arrays contributes to one of these new dimensions.
To make the cartesian product of the given arrays broadcastable,
each array is padded by (n, m) new length-1 regular dimensions
(above, below) the target depth. The values of (n, m) are given by
the position of the array; the first array is the outermost axis.
"""
i = lateral_context["i"]
if depth == posaxis:
n_inside = len(array_layouts) - i - 1
n_outside = i
if (
layout.parameter("__array__") == "string"
or layout.parameter("__array__") == "bytestring"
):
raise ak._errors.wrap_error(
ValueError(
"ak.cartesian does not compute combinations of the "
"characters of a string; please split it into lists"
)
)
nextlayout = ak._do.recursively_apply(
layout,
apply_pad_inner_list,
behavior,
lateral_context={"n": n_inside},
)
fields = []
layouts = []
for i, (n, x) in enumerate(new_arrays.items()):
fields.append(n)
layouts.append(apply(x, i))
if i < len(new_arrays) - 1 and n not in nested:
toflatten.append(posaxis + i + 1)
return add_outer_dimensions(nextlayout, n_outside)
else:
return None

else:
if nested is True:
nested = list(range(len(new_arrays) - 1))
if any(
not (isinstance(x, int) and 0 <= x < len(new_arrays) - 1)
for x in nested
):
raise ak._errors.wrap_error(
ValueError(
"the 'nested' parameter of cartesian must be integers in "
"[0, len(arrays) - 1) for an iterable of arrays"
)
# New _interior_ axes are added to the result layout, but
# unless explicitly named, these axes should be flattened.
axes_to_flatten = [
posaxis + i + 1
for i, _ in enumerate(array_layouts)
if i < len(array_layouts) - 1 and i not in nested
]
# This list *must* be sorted in reverse order
axes_to_flatten.reverse()

if isinstance(array_layouts, dict):
fields = list(array_layouts.keys())
new_layouts = [
ak._do.recursively_apply(
layout,
apply_pad_inner_list_at_axis,
behavior,
lateral_context={"i": i},
)
for i, (_, layout) in enumerate(array_layouts.items())
]

else:
fields = None
layouts = []
for i, x in enumerate(new_arrays):
layouts.append(apply(x, i))
if i < len(new_arrays) - 1 and i not in nested:
toflatten.append(posaxis + i + 1)

def getfunction3(inputs, depth, **kwargs):
if depth == posaxis + len(new_arrays):
if all(len(x) == 0 for x in inputs):
inputs = [
x.content
if isinstance(x, ak.contents.RegularArray) and x.size == 1
else x
for x in inputs
]
new_layouts = [
ak._do.recursively_apply(
layout,
apply_pad_inner_list_at_axis,
behavior,
lateral_context={"i": i},
)
for i, layout in enumerate(array_layouts)
]

def apply_build_record(inputs, depth, **kwargs):
if depth == posaxis + len(array_layouts):
return (ak.contents.RecordArray(inputs, fields, parameters=parameters),)

else:
return None

out = ak._broadcasting.broadcast_and_apply(
layouts, getfunction3, behavior, right_broadcast=False
new_layouts, apply_build_record, behavior, right_broadcast=False
)
assert isinstance(out, tuple) and len(out) == 1
result = out[0]

while len(toflatten) != 0:
flatten_axis = toflatten.pop()
# Remove surplus dimensions, iterating from greatest to smallest axis
for axis_to_flatten in axes_to_flatten:
result = ak.operations.flatten(
result, axis=flatten_axis, highlevel=False, behavior=behavior
result, axis=axis_to_flatten, highlevel=False, behavior=behavior
)

return wrap_layout(result, behavior, highlevel)
Loading