Skip to content

Commit

Permalink
addressing the case when output region for repeat operation is too big
Browse files Browse the repository at this point in the history
  • Loading branch information
ipdemes committed May 31, 2022
1 parent 472d6d0 commit 17db7be
Show file tree
Hide file tree
Showing 4 changed files with 66 additions and 10 deletions.
66 changes: 57 additions & 9 deletions cunumeric/module.py
Original file line number Diff line number Diff line change
Expand Up @@ -2046,7 +2046,6 @@ def repeat(a, repeats, axis=None):
--------
Multiple GPUs, Multiple CPUs
"""

# when array is a scalar
if np.ndim(a) == 0:
if np.ndim(repeats) == 0:
Expand Down Expand Up @@ -2100,11 +2099,37 @@ def repeat(a, repeats, axis=None):
category=UserWarning,
)
repeats = np.int64(repeats)
result = array._thunk.repeat(
repeats=repeats,
axis=axis,
scalar_repeats=True,
)
if repeats < 0:
return ValueError(
"'repeats' should not be negative: {}".format(repeats)
)

# check output shape (if it will fit to GPU or not)
out_shape = list(array.shape)
out_shape[axis] *= repeats
out_shape = tuple(out_shape)
size = sum(out_shape) * array.itemsize
# check if size of the output array is less than 8GB. In this case we can
# use output regions, otherwise we will use a statically allocated
# array
print("IRINA DEBUG 1", size, (8589934592 / 2 - size))
if size < 8589934592 / 2:

result = array._thunk.repeat(
repeats=repeats, axis=axis, scalar_repeats=True
)
else:
# this implementation is taken from CuPy
result = ndarray(shape=out_shape, dtype=array.dtype)
a_index = [slice(None)] * len(out_shape)
res_index = list(a_index)
offset = 0
for i in range(a._shape[axis]):
a_index[axis] = slice(i, i + 1)
res_index[axis] = slice(offset, offset + repeats)
result[res_index] = array[a_index]
offset += repeats
return result
# repeats is an array
else:
# repeats should be integer type
Expand All @@ -2116,9 +2141,32 @@ def repeat(a, repeats, axis=None):
repeats = repeats.astype(np.int64)
if repeats.shape[0] != array.shape[axis]:
return ValueError("incorrect shape of repeats array")
result = array._thunk.repeat(
repeats=repeats._thunk, axis=axis, scalar_repeats=False
)

# check output shape (if it will fit to GPU or not)
out_shape = list(array.shape)
n_repeats = sum(repeats)
out_shape[axis] = n_repeats
out_shape = tuple(out_shape)
size = sum(out_shape) * array.itemsize
# check if size of the output array is less than 8GB. In this case we can
# use output regions, otherwise we will use a statically allocated
# array
print("IRINA DEBUG 1", size, (8589934592 / 2 - size))
if size < 8589934592 / 2:
result = array._thunk.repeat(
repeats=repeats._thunk, axis=axis, scalar_repeats=False
)
else: # this implementation is taken from CuPy
result = ndarray(shape=out_shape, dtype=array.dtype)
a_index = [slice(None)] * len(out_shape)
res_index = list(a_index)
offset = 0
for i in range(a._shape[axis]):
a_index[axis] = slice(i, i + 1)
res_index[axis] = slice(offset, offset + repeats[i])
result[res_index] = array[a_index]
offset += repeats[i]
return result
return ndarray(shape=result.shape, thunk=result)


Expand Down
4 changes: 4 additions & 0 deletions src/cunumeric/index/repeat.cc
Original file line number Diff line number Diff line change
Expand Up @@ -69,6 +69,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
int64_t out_idx = 0;
for (size_t in_idx = 0; in_idx < volume; ++in_idx) {
auto p = in_pitches.unflatten(in_idx, in_rect.lo);
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
for (size_t r = 0; r < repeats[p]; r++) out[out_idx++] = in[p];
}
}
Expand All @@ -88,6 +90,8 @@ struct RepeatImplBody<VariantKind::CPU, CODE, DIM> {
for (int64_t idx = in_rect.lo[axis]; idx <= in_rect.hi[axis]; ++idx) {
p[axis] = idx;
offsets[off_idx++] = sum;
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
sum += repeats[p];
}

Expand Down
2 changes: 2 additions & 0 deletions src/cunumeric/index/repeat.cu
Original file line number Diff line number Diff line change
Expand Up @@ -41,6 +41,8 @@ static __global__ void __launch_bounds__(THREADS_PER_BLOCK, MIN_CTAS_PER_SM)
if (offset < extent) {
auto p = origin;
p[axis] += offset;
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
auto val = repeats[p];
offsets[offset] = val;
SumReduction<int64_t>::fold<true>(value, val);
Expand Down
4 changes: 3 additions & 1 deletion src/cunumeric/index/repeat_omp.cc
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,9 @@ struct RepeatImplBody<VariantKind::OMP, CODE, DIM> {
int64_t axis_lo = p[axis];
#pragma omp for schedule(static) private(p)
for (int64_t idx = 0; idx < axis_extent; ++idx) {
p[axis] = axis_lo + idx;
p[axis] = axis_lo + idx;
// TODO replace assert with Legate exception handling interface when available
assert(repeats[p] >= 0);
auto val = repeats[p];
offsets[idx] = val;
local_sums[tid] += val;
Expand Down

0 comments on commit 17db7be

Please sign in to comment.