From 7fd07d0fdeae920950a687bf0687222022653f2d Mon Sep 17 00:00:00 2001
From: Gregory Lee <grlee77@gmail.com>
Date: Thu, 2 Feb 2023 01:17:20 -0800
Subject: [PATCH] use vendored version of cupy.pad with added performance
 optimizations (#482)

## Overview

This version provides faster, elementwise kernel implementations for common padding modes.

It is under `_vendored` because most of `pad.py` is copied from CuPy itself. The only new part there is the `_use_elementwise_kernel` utility and the conditional branch where it evaluates to True. The newly written code is mostly in `pad_elementwise.py`.

I could potentially further refactor `pad.py` to remove most of the code and just call out to `cupy.pad` instead whenever we aren't using the elementwise kernels.

This version should also be submited upstream to CuPy itself.

Padding performance is substantially improved for modes `edge`, `symmetric`, `reflect` and `wrap`. Most places in cuCIM where we use padding, it is not the bottleneck, but it should still provide a small performance improvement in several places. I ran some benchmarks, and the largest impact I saw was around 25% reduction in run-time for `chan_vese`.


## Benchmark Results (vs. `cupy.pad`)

In the following, the next-to-last column is the overall acceleration observed. It is large for small 2D or 3D images (>5x) and becomes relatively small for larger images (e.g. ~10% for 4k images).

The final column only relates to the amount of time spent on the host. That "accel. CPU" number always strongly favors the new implementation. It has lower host overhead because everything is done in a single kernel call rather than potentially using multiple kernels for each axis in turn. This kernel launch overhead explains why the overall benefit is much higher for the smaller image sizes.

shape | pad_width | dtype | mode | order | duration, old (ms) | duration, new (ms) | accel. | accel. CPU
------|-----------|-------|------|-------|--------------------|--------------------|--------|-----------
(256, 256) | 2 | uint8 | edge | C | 0.1278 | 0.0230 | 5.563 | 6.298
(256, 256) | 2 | uint8 | symmetric | C | 0.1286 | 0.0230 | 5.583 | 6.268
(256, 256) | 2 | uint8 | reflect | C | 0.1294 | 0.0236 | 5.479 | 6.165
(256, 256) | 2 | uint8 | wrap | C | 0.1246 | 0.0228 | 5.468 | 6.149
(256, 256) | 16 | uint8 | edge | C | 0.1276 | 0.0229 | 5.563 | 6.269
(256, 256) | 16 | uint8 | symmetric | C | 0.1305 | 0.0231 | 5.645 | 6.366
(256, 256) | 16 | uint8 | reflect | C | 0.1300 | 0.0235 | 5.539 | 6.220
(256, 256) | 16 | uint8 | wrap | C | 0.1270 | 0.0228 | 5.568 | 6.268
(256, 256) | 2 | uint8 | edge | F | 0.1300 | 0.0234 | 5.567 | 6.281
(256, 256) | 2 | uint8 | symmetric | F | 0.1291 | 0.0236 | 5.471 | 6.157
(256, 256) | 2 | uint8 | reflect | F | 0.1294 | 0.0238 | 5.427 | 6.080
(256, 256) | 2 | uint8 | wrap | F | 0.1254 | 0.0234 | 5.363 | 6.043
(256, 256) | 16 | uint8 | edge | F | 0.1279 | 0.0232 | 5.506 | 6.315
(256, 256) | 16 | uint8 | symmetric | F | 0.1294 | 0.0236 | 5.472 | 6.319
(256, 256) | 16 | uint8 | reflect | F | 0.1300 | 0.0239 | 5.434 | 6.262
(256, 256) | 16 | uint8 | wrap | F | 0.1262 | 0.0238 | 5.310 | 6.134
(1024, 1024) | 2 | uint8 | edge | C | 0.1279 | 0.0255 | 5.020 | 6.287
(1024, 1024) | 2 | uint8 | symmetric | C | 0.1285 | 0.0258 | 4.980 | 6.259
(1024, 1024) | 2 | uint8 | reflect | C | 0.1286 | 0.0263 | 4.888 | 6.118
(1024, 1024) | 2 | uint8 | wrap | C | 0.1253 | 0.0255 | 4.905 | 6.170
(1024, 1024) | 16 | uint8 | edge | C | 0.1277 | 0.0258 | 4.947 | 6.270
(1024, 1024) | 16 | uint8 | symmetric | C | 0.1286 | 0.0261 | 4.931 | 6.296
(1024, 1024) | 16 | uint8 | reflect | C | 0.1280 | 0.0264 | 4.845 | 6.132
(1024, 1024) | 16 | uint8 | wrap | C | 0.1249 | 0.0260 | 4.798 | 6.095
(1024, 1024) | 2 | uint8 | edge | F | 0.1289 | 0.0581 | 2.217 | 6.084
(1024, 1024) | 2 | uint8 | symmetric | F | 0.1304 | 0.0586 | 2.227 | 6.064
(1024, 1024) | 2 | uint8 | reflect | F | 0.1331 | 0.0590 | 2.257 | 6.059
(1024, 1024) | 2 | uint8 | wrap | F | 0.1278 | 0.0586 | 2.180 | 5.994
(1024, 1024) | 16 | uint8 | edge | F | 0.1299 | 0.0604 | 2.149 | 6.238
(1024, 1024) | 16 | uint8 | symmetric | F | 0.1315 | 0.0607 | 2.168 | 6.255
(1024, 1024) | 16 | uint8 | reflect | F | 0.1309 | 0.0614 | 2.133 | 6.070
(1024, 1024) | 16 | uint8 | wrap | F | 0.1275 | 0.0606 | 2.103 | 6.105
(4096, 4096) | 2 | uint8 | edge | C | 0.1291 | 0.1143 | 1.130 | 6.202
(4096, 4096) | 2 | uint8 | symmetric | C | 0.1296 | 0.1132 | 1.145 | 6.183
(4096, 4096) | 2 | uint8 | reflect | C | 0.1295 | 0.1151 | 1.125 | 6.064
(4096, 4096) | 2 | uint8 | wrap | C | 0.1266 | 0.1138 | 1.112 | 6.029
(4096, 4096) | 16 | uint8 | edge | C | 0.1295 | 0.1157 | 1.119 | 6.212
(4096, 4096) | 16 | uint8 | symmetric | C | 0.1301 | 0.1150 | 1.131 | 6.208
(4096, 4096) | 16 | uint8 | reflect | C | 0.1302 | 0.1168 | 1.115 | 6.088
(4096, 4096) | 16 | uint8 | wrap | C | 0.1272 | 0.1153 | 1.103 | 6.065
(4096, 4096) | 2 | uint8 | edge | F | 0.6624 | 0.6433 | 1.030 | 6.228
(4096, 4096) | 2 | uint8 | symmetric | F | 0.6639 | 0.6438 | 1.031 | 6.133
(4096, 4096) | 2 | uint8 | reflect | F | 0.6640 | 0.6441 | 1.031 | 6.003
(4096, 4096) | 2 | uint8 | wrap | F | 0.6638 | 0.6454 | 1.028 | 6.037
(4096, 4096) | 16 | uint8 | edge | F | 0.6909 | 0.6713 | 1.029 | 6.318
(4096, 4096) | 16 | uint8 | symmetric | F | 0.6915 | 0.6717 | 1.029 | 6.229
(4096, 4096) | 16 | uint8 | reflect | F | 0.6919 | 0.6724 | 1.029 | 6.082
(4096, 4096) | 16 | uint8 | wrap | F | 0.6923 | 0.6720 | 1.030 | 6.136
(40, 40, 40) | 2 | uint8 | edge | C | 0.2057 | 0.0239 | 8.610 | 9.765
(40, 40, 40) | 2 | uint8 | symmetric | C | 0.2014 | 0.0241 | 8.357 | 9.450
(40, 40, 40) | 2 | uint8 | reflect | C | 0.1999 | 0.0245 | 8.169 | 9.227
(40, 40, 40) | 2 | uint8 | wrap | C | 0.1969 | 0.0237 | 8.299 | 9.405
(40, 40, 40) | 16 | uint8 | edge | C | 0.2028 | 0.0235 | 8.633 | 9.760
(40, 40, 40) | 16 | uint8 | symmetric | C | 0.2000 | 0.0255 | 7.844 | 9.502
(40, 40, 40) | 16 | uint8 | reflect | C | 0.1988 | 0.0250 | 7.946 | 9.339
(40, 40, 40) | 16 | uint8 | wrap | C | 0.1948 | 0.0248 | 7.871 | 9.371
(40, 40, 40) | 2 | uint8 | edge | F | 0.1980 | 0.0248 | 7.994 | 9.322
(40, 40, 40) | 2 | uint8 | symmetric | F | 0.1963 | 0.0250 | 7.840 | 9.159
(40, 40, 40) | 2 | uint8 | reflect | F | 0.1952 | 0.0253 | 7.729 | 8.985
(40, 40, 40) | 2 | uint8 | wrap | F | 0.1898 | 0.0251 | 7.567 | 8.847
(40, 40, 40) | 16 | uint8 | edge | F | 0.1997 | 0.0331 | 6.035 | 9.161
(40, 40, 40) | 16 | uint8 | symmetric | F | 0.1964 | 0.0349 | 5.622 | 8.393
(40, 40, 40) | 16 | uint8 | reflect | F | 0.1967 | 0.0339 | 5.793 | 8.808
(40, 40, 40) | 16 | uint8 | wrap | F | 0.1924 | 0.0334 | 5.762 | 8.909
(100, 100, 100) | 2 | uint8 | edge | C | 0.2042 | 0.0288 | 7.101 | 9.676
(100, 100, 100) | 2 | uint8 | symmetric | C | 0.1994 | 0.0317 | 6.294 | 9.334
(100, 100, 100) | 2 | uint8 | reflect | C | 0.2007 | 0.0302 | 6.634 | 9.287
(100, 100, 100) | 2 | uint8 | wrap | C | 0.1946 | 0.0308 | 6.315 | 9.179
(100, 100, 100) | 16 | uint8 | edge | C | 0.2023 | 0.0369 | 5.483 | 9.468
(100, 100, 100) | 16 | uint8 | symmetric | C | 0.2012 | 0.0451 | 4.465 | 9.197
(100, 100, 100) | 16 | uint8 | reflect | C | 0.2006 | 0.0411 | 4.886 | 9.073
(100, 100, 100) | 16 | uint8 | wrap | C | 0.1958 | 0.0429 | 4.561 | 9.060
(100, 100, 100) | 2 | uint8 | edge | F | 0.1996 | 0.0630 | 3.167 | 9.158
(100, 100, 100) | 2 | uint8 | symmetric | F | 0.1962 | 0.0636 | 3.084 | 8.816
(100, 100, 100) | 2 | uint8 | reflect | F | 0.1957 | 0.0638 | 3.068 | 8.725
(100, 100, 100) | 2 | uint8 | wrap | F | 0.1908 | 0.0636 | 2.999 | 8.719
(100, 100, 100) | 16 | uint8 | edge | F | 0.2041 | 0.0995 | 2.052 | 9.048
(100, 100, 100) | 16 | uint8 | symmetric | F | 0.2055 | 0.1039 | 1.978 | 8.858
(100, 100, 100) | 16 | uint8 | reflect | F | 0.2040 | 0.1038 | 1.965 | 8.821
(100, 100, 100) | 16 | uint8 | wrap | F | 0.1989 | 0.1071 | 1.858 | 8.757
(256, 256, 256) | 2 | uint8 | edge | C | 0.2063 | 0.1495 | 1.380 | 9.652
(256, 256, 256) | 2 | uint8 | symmetric | C | 0.2065 | 0.1613 | 1.280 | 9.647
(256, 256, 256) | 2 | uint8 | reflect | C | 0.2055 | 0.1540 | 1.334 | 9.328
(256, 256, 256) | 2 | uint8 | wrap | C | 0.1997 | 0.1569 | 1.273 | 9.326
(256, 256, 256) | 16 | uint8 | edge | C | 0.2090 | 0.1973 | 1.060 | 9.704
(256, 256, 256) | 16 | uint8 | symmetric | C | 0.2113 | 0.2419 | 0.873 | 9.573
(256, 256, 256) | 16 | uint8 | reflect | C | 0.2131 | 0.2124 | 1.003 | 9.351
(256, 256, 256) | 16 | uint8 | wrap | C | 0.2076 | 0.2311 | 0.899 | 9.410

Authors:
  - Gregory Lee (https://github.com/grlee77)
  - https://github.com/jakirkham

Approvers:
  - Gigon Bae (https://github.com/gigony)

URL: https://github.com/rapidsai/cucim/pull/482
---
 .../core/operations/morphology/_pba_2d.py     |   3 +-
 .../core/operations/morphology/_pba_3d.py     |   3 +-
 .../src/cucim/skimage/_vendored/__init__.py   |   1 +
 .../_vendored/_ndimage_interpolation.py       |   9 +-
 .../cucim/src/cucim/skimage/_vendored/pad.py  | 813 ++++++++++++++++++
 .../skimage/_vendored/pad_elementwise.py      | 230 +++++
 .../src/cucim/skimage/exposure/_adapthist.py  |  15 +-
 .../src/cucim/skimage/feature/template.py     |   7 +-
 .../cucim/skimage/feature/tests/test_blob.py  |  48 +-
 .../skimage/feature/tests/test_corner.py      |  11 +-
 .../src/cucim/skimage/filters/_fft_based.py   |   3 +-
 .../src/cucim/skimage/filters/_median_hist.py |   3 +-
 .../src/cucim/skimage/filters/_sparse.py      |   3 +-
 .../skimage/filters/tests/test_gaussian.py    |   3 +-
 .../src/cucim/skimage/filters/thresholding.py |   5 +-
 .../src/cucim/skimage/measure/_regionprops.py |   4 +-
 .../skimage/measure/_regionprops_utils.py     |   6 +-
 .../cucim/src/cucim/skimage/measure/block.py  |   5 +-
 .../skimage/measure/tests/test_regionprops.py |   5 +-
 .../cucim/skimage/metrics/simple_metrics.py   |   3 +-
 .../src/cucim/skimage/morphology/gray.py      |   3 +-
 .../cucim/skimage/segmentation/_chan_vese.py  |   5 +-
 .../src/cucim/skimage/transform/_warps.py     |   9 +-
 23 files changed, 1132 insertions(+), 65 deletions(-)
 create mode 100644 python/cucim/src/cucim/skimage/_vendored/pad.py
 create mode 100644 python/cucim/src/cucim/skimage/_vendored/pad_elementwise.py

diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
index 8327aa6cb..85aacd06e 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_2d.py
@@ -5,6 +5,7 @@
 
 import cupy
 
+from cucim.skimage._vendored import pad
 from cucim.skimage._vendored._ndimage_util import _get_inttype
 
 try:
@@ -352,7 +353,7 @@ def _pba_2d(arr, sampling=None, return_distances=True, return_indices=False,
     orig_sy, orig_sx = arr.shape
     padding_width = _determine_padding(arr.shape, padded_size, block_size)
     if padding_width is not None:
-        arr = cupy.pad(arr, padding_width, mode="constant", constant_values=1)
+        arr = pad(arr, padding_width, mode="constant", constant_values=1)
     size = arr.shape[0]
 
     input_arr = _pack_int2(arr, marker=marker, int_dtype=int_dtype)
diff --git a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
index b62706546..f7c572bfe 100644
--- a/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
+++ b/python/cucim/src/cucim/core/operations/morphology/_pba_3d.py
@@ -4,6 +4,7 @@
 import cupy
 import numpy as np
 
+from cucim.skimage._vendored import pad
 from cucim.skimage._vendored._ndimage_util import _get_inttype
 
 from ._pba_2d import (_check_distances, _check_indices,
@@ -366,7 +367,7 @@ def _pba_3d(arr, sampling=None, return_distances=True, return_indices=False,
         arr.shape, block_size, m1, m2, m3, blockx, blocky
     )
     if padding_width is not None:
-        arr = cupy.pad(arr, padding_width, mode="constant", constant_values=1)
+        arr = pad(arr, padding_width, mode="constant", constant_values=1)
     size = arr.shape[0]
 
     # pba algorithm was implemented to use 32-bit integer to store compressed
diff --git a/python/cucim/src/cucim/skimage/_vendored/__init__.py b/python/cucim/src/cucim/skimage/_vendored/__init__.py
index 1f7c85383..8b74f933d 100644
--- a/python/cucim/src/cucim/skimage/_vendored/__init__.py
+++ b/python/cucim/src/cucim/skimage/_vendored/__init__.py
@@ -4,4 +4,5 @@
 """
 
 from cucim.skimage._vendored._pearsonr import pearsonr
+from cucim.skimage._vendored.pad import pad
 from cucim.skimage._vendored.signaltools import *  # noqa
diff --git a/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py b/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py
index 18275213b..ab396a0e5 100644
--- a/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py
+++ b/python/cucim/src/cucim/skimage/_vendored/_ndimage_interpolation.py
@@ -11,6 +11,7 @@
 from cucim.skimage._vendored import \
     _ndimage_spline_prefilter_core as _spline_prefilter_core
 from cucim.skimage._vendored import _ndimage_util as _util
+from cucim.skimage._vendored import pad
 from cucim.skimage._vendored._internal import _normalize_axis_index, prod
 
 
@@ -223,7 +224,7 @@ def _prepad_for_spline_filter(input, mode, cval):
             kwargs = dict(mode='constant', constant_values=cval)
         else:
             kwargs = dict(mode='edge')
-        padded = cupy.pad(input, npad, **kwargs)
+        padded = pad(input, npad, **kwargs)
     else:
         npad = 0
         padded = input
@@ -236,7 +237,7 @@ def _filter_input(image, prefilter, mode, cval, order):
     Spline orders > 1 need a prefiltering stage to preserve resolution.
 
     For boundary modes without analytical spline boundary conditions, some
-    prepadding of the input with cupy.pad is used to maintain accuracy.
+    prepadding of the input with pad is used to maintain accuracy.
     ``npad`` is an integer corresponding to the amount of padding at each edge
     of the array.
     """
@@ -289,8 +290,8 @@ def map_coordinates(input, coordinates, output=None, order=3,
     _check_parameter('map_coordinates', order, mode)
 
     if mode == 'opencv' or mode == '_opencv_edge':
-        input = cupy.pad(input, [(1, 1)] * input.ndim, 'constant',
-                         constant_values=cval)
+        input = pad(input, [(1, 1)] * input.ndim, 'constant',
+                    constant_values=cval)
         coordinates = cupy.add(coordinates, 1)
         mode = 'constant'
 
diff --git a/python/cucim/src/cucim/skimage/_vendored/pad.py b/python/cucim/src/cucim/skimage/_vendored/pad.py
new file mode 100644
index 000000000..3d744b431
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/pad.py
@@ -0,0 +1,813 @@
+"""version of cupy.pad that dispatches to elementwise kernels for some boundary
+modes: {'edge', 'wrap', 'symmetric', 'reflect'}.
+
+New utility _use_elementwise_kernel determines when to use the new elementwise
+kernels, otherwise the existing implementations as in cupy.pad are used.
+"""
+
+import numbers
+
+import cupy
+import numpy
+
+###############################################################################
+# Private utility functions.
+
+
+def _round_if_needed(arr, dtype):
+    """Rounds arr inplace if the destination dtype is an integer.
+    """
+    if cupy.issubdtype(dtype, cupy.integer):
+        arr.round(out=arr)  # bug in round so use rint (cupy/cupy#2330)
+
+
+def _slice_at_axis(sl, axis):
+    """Constructs a tuple of slices to slice an array in the given dimension.
+
+    Args:
+      sl(slice): The slice for the given dimension.
+      axis(int): The axis to which `sl` is applied. All other dimensions are
+          left "unsliced".
+
+    Returns:
+      tuple of slices: A tuple with slices matching `shape` in length.
+    """
+    return (slice(None),) * axis + (sl,) + (Ellipsis,)
+
+
+def _view_roi(array, original_area_slice, axis):
+    """Gets a view of the current region of interest during iterative padding.
+
+    When padding multiple dimensions iteratively corner values are
+    unnecessarily overwritten multiple times. This function reduces the
+    working area for the first dimensions so that corners are excluded.
+
+    Args:
+      array(cupy.ndarray): The array with the region of interest.
+      original_area_slice(tuple of slices): Denotes the area with original
+          values of the unpadded array.
+      axis(int): The currently padded dimension assuming that `axis` is padded
+          before `axis` + 1.
+
+    Returns:
+    """
+    axis += 1
+    sl = (slice(None),) * axis + original_area_slice[axis:]
+    return array[sl]
+
+
+def _pad_simple(array, pad_width, fill_value=None):
+    """Pads an array on all sides with either a constant or undefined values.
+
+    Args:
+      array(cupy.ndarray): Array to grow.
+      pad_width(sequence of tuple[int, int]): Pad width on both sides for each
+          dimension in `arr`.
+      fill_value(scalar, optional): If provided the padded area is
+          filled with this value, otherwise the pad area left undefined.
+          (Default value = None)
+    """
+    # Allocate grown array
+    new_shape = tuple(
+        left + size + right
+        for size, (left, right) in zip(array.shape, pad_width)
+    )
+    order = 'F' if array.flags.fnc else 'C'  # Fortran and not also C-order
+    padded = cupy.empty(new_shape, dtype=array.dtype, order=order)
+
+    if fill_value is not None:
+        padded.fill(fill_value)
+
+    # Copy old array into correct space
+    original_area_slice = tuple(
+        slice(left, left + size)
+        for size, (left, right) in zip(array.shape, pad_width)
+    )
+    padded[original_area_slice] = array
+
+    return padded, original_area_slice
+
+
+def _set_pad_area(padded, axis, width_pair, value_pair):
+    """Set an empty-padded area in given dimension.
+    """
+    left_slice = _slice_at_axis(slice(None, width_pair[0]), axis)
+    padded[left_slice] = value_pair[0]
+
+    right_slice = _slice_at_axis(
+        slice(padded.shape[axis] - width_pair[1], None), axis
+    )
+    padded[right_slice] = value_pair[1]
+
+
+def _get_edges(padded, axis, width_pair):
+    """Retrieves edge values from an empty-padded array along a given axis.
+
+    Args:
+      padded(cupy.ndarray): Empty-padded array.
+      axis(int): Dimension in which the edges are considered.
+      width_pair((int, int)): Pair of widths that mark the pad area on both
+          sides in the given dimension.
+    """
+    left_index = width_pair[0]
+    left_slice = _slice_at_axis(slice(left_index, left_index + 1), axis)
+    left_edge = padded[left_slice]
+
+    right_index = padded.shape[axis] - width_pair[1]
+    right_slice = _slice_at_axis(slice(right_index - 1, right_index), axis)
+    right_edge = padded[right_slice]
+
+    return left_edge, right_edge
+
+
+def _get_linear_ramps(padded, axis, width_pair, end_value_pair):
+    """Constructs linear ramps for an empty-padded array along a given axis.
+
+    Args:
+      padded(cupy.ndarray): Empty-padded array.
+      axis(int): Dimension in which the ramps are constructed.
+      width_pair((int, int)): Pair of widths that mark the pad area on both
+          sides in the given dimension.
+      end_value_pair((scalar, scalar)): End values for the linear ramps which
+          form the edge of the fully padded array. These values are included in
+          the linear ramps.
+    """
+    edge_pair = _get_edges(padded, axis, width_pair)
+
+    left_ramp = cupy.linspace(
+        start=end_value_pair[0],
+        # squeeze axis replaced by linspace
+        stop=edge_pair[0].squeeze(axis),
+        num=width_pair[0],
+        endpoint=False,
+        dtype=padded.dtype,
+        axis=axis,
+    )
+
+    right_ramp = cupy.linspace(
+        start=end_value_pair[1],
+        # squeeze axis replaced by linspace
+        stop=edge_pair[1].squeeze(axis),
+        num=width_pair[1],
+        endpoint=False,
+        dtype=padded.dtype,
+        axis=axis,
+    )
+    # Reverse linear space in appropriate dimension
+    right_ramp = right_ramp[_slice_at_axis(slice(None, None, -1), axis)]
+
+    return left_ramp, right_ramp
+
+
+def _get_stats(padded, axis, width_pair, length_pair, stat_func):
+    """Calculates a statistic for an empty-padded array along a given axis.
+
+    Args:
+      padded(cupy.ndarray): Empty-padded array.
+      axis(int): Dimension in which the statistic is calculated.
+      width_pair((int, int)): Pair of widths that mark the pad area on both
+          sides in the given dimension.
+      length_pair(2-element sequence of None or int): Gives the number of
+          values in valid area from each side that is taken into account when
+          calculating the statistic. If None the entire valid area in `padded`
+          is considered.
+      stat_func(function): Function to compute statistic. The expected
+          signature is
+          ``stat_func(x: ndarray, axis: int, keepdims: bool) -> ndarray``.
+    """
+    # Calculate indices of the edges of the area with original values
+    left_index = width_pair[0]
+    right_index = padded.shape[axis] - width_pair[1]
+    # as well as its length
+    max_length = right_index - left_index
+
+    # Limit stat_lengths to max_length
+    left_length, right_length = length_pair
+    if left_length is None or max_length < left_length:
+        left_length = max_length
+    if right_length is None or max_length < right_length:
+        right_length = max_length
+
+    # Calculate statistic for the left side
+    left_slice = _slice_at_axis(
+        slice(left_index, left_index + left_length), axis
+    )
+    left_chunk = padded[left_slice]
+    left_stat = stat_func(left_chunk, axis=axis, keepdims=True)
+    _round_if_needed(left_stat, padded.dtype)
+
+    if left_length == right_length == max_length:
+        # return early as right_stat must be identical to left_stat
+        return left_stat, left_stat
+
+    # Calculate statistic for the right side
+    right_slice = _slice_at_axis(
+        slice(right_index - right_length, right_index), axis
+    )
+    right_chunk = padded[right_slice]
+    right_stat = stat_func(right_chunk, axis=axis, keepdims=True)
+    _round_if_needed(right_stat, padded.dtype)
+    return left_stat, right_stat
+
+
+def _set_reflect_both(padded, axis, width_pair, method, include_edge=False):
+    """Pads an `axis` of `arr` using reflection.
+
+    Args:
+      padded(cupy.ndarray): Input array of arbitrary shape.
+      axis(int): Axis along which to pad `arr`.
+      width_pair((int, int)): Pair of widths that mark the pad area on both
+          sides in the given dimension.
+      method(str): Controls method of reflection; options are 'even' or 'odd'.
+      include_edge(bool, optional): If true, edge value is included in
+          reflection, otherwise the edge value forms the symmetric axis to the
+          reflection. (Default value = False)
+    """
+    left_pad, right_pad = width_pair
+    old_length = padded.shape[axis] - right_pad - left_pad
+
+    if include_edge:
+        # Edge is included, we need to offset the pad amount by 1
+        edge_offset = 1
+    else:
+        edge_offset = 0  # Edge is not included, no need to offset pad amount
+        old_length -= 1  # but must be omitted from the chunk
+
+    if left_pad > 0:
+        # Pad with reflected values on left side:
+        # First limit chunk size which can't be larger than pad area
+        chunk_length = min(old_length, left_pad)
+        # Slice right to left, stop on or next to edge, start relative to stop
+        stop = left_pad - edge_offset
+        start = stop + chunk_length
+        left_slice = _slice_at_axis(slice(start, stop, -1), axis)
+        left_chunk = padded[left_slice]
+
+        if method == 'odd':
+            # Negate chunk and align with edge
+            edge_slice = _slice_at_axis(slice(left_pad, left_pad + 1), axis)
+            left_chunk = 2 * padded[edge_slice] - left_chunk
+
+        # Insert chunk into padded area
+        start = left_pad - chunk_length
+        stop = left_pad
+        pad_area = _slice_at_axis(slice(start, stop), axis)
+        padded[pad_area] = left_chunk
+        # Adjust pointer to left edge for next iteration
+        left_pad -= chunk_length
+
+    if right_pad > 0:
+        # Pad with reflected values on right side:
+        # First limit chunk size which can't be larger than pad area
+        chunk_length = min(old_length, right_pad)
+        # Slice right to left, start on or next to edge, stop relative to start
+        start = -right_pad + edge_offset - 2
+        stop = start - chunk_length
+        right_slice = _slice_at_axis(slice(start, stop, -1), axis)
+        right_chunk = padded[right_slice]
+
+        if method == 'odd':
+            # Negate chunk and align with edge
+            edge_slice = _slice_at_axis(
+                slice(-right_pad - 1, -right_pad), axis
+            )
+            right_chunk = 2 * padded[edge_slice] - right_chunk
+
+        # Insert chunk into padded area
+        start = padded.shape[axis] - right_pad
+        stop = start + chunk_length
+        pad_area = _slice_at_axis(slice(start, stop), axis)
+        padded[pad_area] = right_chunk
+        # Adjust pointer to right edge for next iteration
+        right_pad -= chunk_length
+
+    return left_pad, right_pad
+
+
+def _set_wrap_both(padded, axis, width_pair):
+    """Pads an `axis` of `arr` with wrapped values.
+
+    Args:
+      padded(cupy.ndarray): Input array of arbitrary shape.
+      axis(int): Axis along which to pad `arr`.
+      width_pair((int, int)): Pair of widths that mark the pad area on both
+          sides in the given dimension.
+    """
+    left_pad, right_pad = width_pair
+    period = padded.shape[axis] - right_pad - left_pad
+
+    # If the current dimension of `arr` doesn't contain enough valid values
+    # (not part of the undefined pad area) we need to pad multiple times.
+    # Each time the pad area shrinks on both sides which is communicated with
+    # these variables.
+    new_left_pad = 0
+    new_right_pad = 0
+
+    if left_pad > 0:
+        # Pad with wrapped values on left side
+        # First slice chunk from right side of the non-pad area.
+        # Use min(period, left_pad) to ensure that chunk is not larger than
+        # pad area
+        right_slice = _slice_at_axis(
+            slice(
+                -right_pad - min(period, left_pad),
+                -right_pad if right_pad != 0 else None,
+            ),
+            axis,
+        )
+        right_chunk = padded[right_slice]
+
+        if left_pad > period:
+            # Chunk is smaller than pad area
+            pad_area = _slice_at_axis(slice(left_pad - period, left_pad), axis)
+            new_left_pad = left_pad - period
+        else:
+            # Chunk matches pad area
+            pad_area = _slice_at_axis(slice(None, left_pad), axis)
+        padded[pad_area] = right_chunk
+
+    if right_pad > 0:
+        # Pad with wrapped values on right side
+        # First slice chunk from left side of the non-pad area.
+        # Use min(period, right_pad) to ensure that chunk is not larger than
+        # pad area
+        left_slice = _slice_at_axis(
+            slice(left_pad, left_pad + min(period, right_pad)), axis
+        )
+        left_chunk = padded[left_slice]
+
+        if right_pad > period:
+            # Chunk is smaller than pad area
+            pad_area = _slice_at_axis(
+                slice(-right_pad, -right_pad + period), axis
+            )
+            new_right_pad = right_pad - period
+        else:
+            # Chunk matches pad area
+            pad_area = _slice_at_axis(slice(-right_pad, None), axis)
+        padded[pad_area] = left_chunk
+
+    return new_left_pad, new_right_pad
+
+
+def _as_pairs(x, ndim, as_index=False):
+    """Broadcasts `x` to an array with shape (`ndim`, 2).
+
+    A helper function for `pad` that prepares and validates arguments like
+    `pad_width` for iteration in pairs.
+
+    Args:
+      x(scalar or array-like, optional): The object to broadcast to the shape
+          (`ndim`, 2).
+      ndim(int): Number of pairs the broadcasted `x` will have.
+      as_index(bool, optional): If `x` is not None, try to round each
+          element of `x` to an integer (dtype `cupy.intp`) and ensure every
+          element is positive. (Default value = False)
+
+    Returns:
+      nested iterables, shape (`ndim`, 2): The broadcasted version of `x`.
+    """
+    if x is None:
+        # Pass through None as a special case, otherwise cupy.round(x) fails
+        # with an AttributeError
+        return ((None, None),) * ndim
+    elif isinstance(x, numbers.Number):
+        if as_index:
+            x = round(x)
+        return ((x, x),) * ndim
+
+    x = numpy.array(x)
+    if as_index:
+        x = numpy.asarray(numpy.round(x), dtype=numpy.intp)
+
+    if x.ndim < 3:
+        # Optimization: Possibly use faster paths for cases where `x` has
+        # only 1 or 2 elements. `numpy.broadcast_to` could handle these as well
+        # but is currently slower
+
+        if x.size == 1:
+            # x was supplied as a single value
+            x = x.ravel()  # Ensure x[0] works for x.ndim == 0, 1, 2
+            if as_index and x < 0:
+                raise ValueError("index can't contain negative values")
+            return ((x[0], x[0]),) * ndim
+
+        if x.size == 2 and x.shape != (2, 1):
+            # x was supplied with a single value for each side
+            # but except case when each dimension has a single value
+            # which should be broadcasted to a pair,
+            # e.g. [[1], [2]] -> [[1, 1], [2, 2]] not [[1, 2], [1, 2]]
+            x = x.ravel()  # Ensure x[0], x[1] works
+            if as_index and (x[0] < 0 or x[1] < 0):
+                raise ValueError("index can't contain negative values")
+            return ((x[0], x[1]),) * ndim
+
+    if as_index and x.min() < 0:
+        raise ValueError("index can't contain negative values")
+
+    # Converting the array with `tolist` seems to improve performance
+    # when iterating and indexing the result (see usage in `pad`)
+    x_view = x.view()
+    x_view.shape = (ndim, 2)
+    return x_view.tolist()
+
+
+def _use_elementwise_kernel(arr, mode, kwargs):
+    """Determine if we can use an ElementwiseKernel from pad_elementwise.py"""
+    use_elementwise = False
+    if arr.ndim == 0 or arr.size == 0:
+        return False
+    if mode in ('edge', 'wrap'):
+        use_elementwise = True
+    # elif mode == 'constant':
+    #     # Only a uniform constant is supported in the Elementwise kernel.
+    #     # A per-axis constant is not currently supported.
+    #     return isinstance(kwargs.get('constant_values', 0), numbers.Number)
+    elif mode in ('symmetric', 'reflect'):
+        # only the default 'even' reflect type is supported
+        use_elementwise = kwargs.get('reflect_type', 'even') == 'even'
+    if use_elementwise:
+        if (arr.ndim > 2 and (arr.flags.fnc and arr.nbytes > 5_000_000)):
+            # Empirically found slower performance for large Fortran-ordered
+            # arrays with ndim > 2.
+            return False
+        return True
+    return False
+
+
+###############################################################################
+# Public functions
+
+
+# @array_function_dispatch(_pad_dispatcher, module='numpy')
+def pad(array, pad_width, mode='constant', **kwargs):
+    """Pads an array with specified widths and values.
+
+    Args:
+      array(cupy.ndarray): The array to pad.
+      pad_width(sequence, array_like or int): Number of values padded to the
+          edges of each axis. ((before_1, after_1), ... (before_N, after_N))
+          unique pad widths for each axis. ((before, after),) yields same
+          before and after pad for each axis. (pad,) or int is a shortcut for
+          before = after = pad width for all axes. You cannot specify
+          ``cupy.ndarray``.
+      mode(str or function, optional): One of the following string values or a
+          user supplied function
+
+          'constant' (default)
+              Pads with a constant value.
+          'edge'
+              Pads with the edge values of array.
+          'linear_ramp'
+              Pads with the linear ramp between end_value and the array edge
+              value.
+          'maximum'
+              Pads with the maximum value of all or part of the vector along
+              each axis.
+          'mean'
+              Pads with the mean value of all or part of the vector along each
+              axis.
+          'median'
+              Pads with the median value of all or part of the vector along
+              each axis. (Not Implemented)
+          'minimum'
+              Pads with the minimum value of all or part of the vector along
+              each axis.
+          'reflect'
+              Pads with the reflection of the vector mirrored on the first and
+              last values of the vector along each axis.
+          'symmetric'
+               Pads with the reflection of the vector mirrored along the edge
+               of the array.
+          'wrap'
+              Pads with the wrap of the vector along the axis. The first
+              values are used to pad the end and the end values are used to
+              pad the beginning.
+          'empty'
+              Pads with undefined values.
+          <function>
+              Padding function, see Notes.
+      stat_length(sequence or int, optional): Used in 'maximum', 'mean',
+          'median', and 'minimum'.  Number of values at edge of each axis used
+          to calculate the statistic value.
+          ((before_1, after_1), ... (before_N, after_N)) unique statistic
+          lengths for each axis. ((before, after),) yields same before and
+          after statistic lengths for each axis. (stat_length,) or int is a
+          shortcut for before = after = statistic length for all axes.
+          Default is ``None``, to use the entire axis. You cannot specify
+          ``cupy.ndarray``.
+      constant_values(sequence or scalar, optional): Used in 'constant'. The
+          values to set the padded values for each axis.
+          ((before_1, after_1), ... (before_N, after_N)) unique pad constants
+          for each axis.
+          ((before, after),) yields same before and after constants for each
+          axis.
+          (constant,) or constant is a shortcut for before = after = constant
+          for all axes.
+          Default is 0. You cannot specify ``cupy.ndarray``.
+      end_values(sequence or scalar, optional): Used in 'linear_ramp'. The
+          values used for the ending value of the linear_ramp and that will
+          form the edge of the padded array.
+          ((before_1, after_1), ... (before_N, after_N)) unique end values
+          for each axis.
+          ((before, after),) yields same before and after end
+          values for each axis.
+          (constant,) or constant is a shortcut for before = after = constant
+          for all axes.
+          Default is 0. You cannot specify ``cupy.ndarray``.
+      reflect_type({'even', 'odd'}, optional): Used in 'reflect', and
+          'symmetric'.  The 'even' style is the default with an unaltered
+          reflection around the edge value.  For the 'odd' style, the extended
+          part of the array is created by subtracting the reflected values from
+          two times the edge value.
+
+    Returns:
+      cupy.ndarray: Padded array with shape extended by ``pad_width``.
+
+    .. note::
+        For an array with rank greater than 1, some of the padding of later
+        axes is calculated from padding of previous axes.  This is easiest to
+        think about with a rank 2 array where the corners of the padded array
+        are calculated by using padded values from the first axis.
+
+        The padding function, if used, should modify a rank 1 array in-place.
+        It has the following signature:
+
+        ``padding_func(vector, iaxis_pad_width, iaxis, kwargs)``
+
+        where
+
+        vector (cupy.ndarray)
+            A rank 1 array already padded with zeros.  Padded values are
+            ``vector[:iaxis_pad_width[0]]`` and
+            ``vector[-iaxis_pad_width[1]:]``.
+        iaxis_pad_width (tuple)
+            A 2-tuple of ints, ``iaxis_pad_width[0]`` represents the number of
+            values padded at the beginning of vector where
+            ``iaxis_pad_width[1]`` represents the number of values padded at
+            the end of vector.
+        iaxis (int)
+            The axis currently being calculated.
+        kwargs (dict)
+            Any keyword arguments the function requires.
+
+    Examples
+    --------
+    >>> a = cupy.array([1, 2, 3, 4, 5])
+    >>> cupy.pad(a, (2, 3), 'constant', constant_values=(4, 6))
+    array([4, 4, 1, ..., 6, 6, 6])
+
+    >>> cupy.pad(a, (2, 3), 'edge')
+    array([1, 1, 1, ..., 5, 5, 5])
+
+    >>> cupy.pad(a, (2, 3), 'linear_ramp', end_values=(5, -4))
+    array([ 5,  3,  1,  2,  3,  4,  5,  2, -1, -4])
+
+    >>> cupy.pad(a, (2,), 'maximum')
+    array([5, 5, 1, 2, 3, 4, 5, 5, 5])
+
+    >>> cupy.pad(a, (2,), 'mean')
+    array([3, 3, 1, 2, 3, 4, 5, 3, 3])
+
+    >>> a = cupy.array([[1, 2], [3, 4]])
+    >>> cupy.pad(a, ((3, 2), (2, 3)), 'minimum')
+    array([[1, 1, 1, 2, 1, 1, 1],
+           [1, 1, 1, 2, 1, 1, 1],
+           [1, 1, 1, 2, 1, 1, 1],
+           [1, 1, 1, 2, 1, 1, 1],
+           [3, 3, 3, 4, 3, 3, 3],
+           [1, 1, 1, 2, 1, 1, 1],
+           [1, 1, 1, 2, 1, 1, 1]])
+
+    >>> a = cupy.array([1, 2, 3, 4, 5])
+    >>> cupy.pad(a, (2, 3), 'reflect')
+    array([3, 2, 1, 2, 3, 4, 5, 4, 3, 2])
+
+    >>> cupy.pad(a, (2, 3), 'reflect', reflect_type='odd')
+    array([-1,  0,  1,  2,  3,  4,  5,  6,  7,  8])
+
+    >>> cupy.pad(a, (2, 3), 'symmetric')
+    array([2, 1, 1, 2, 3, 4, 5, 5, 4, 3])
+
+    >>> cupy.pad(a, (2, 3), 'symmetric', reflect_type='odd')
+    array([0, 1, 1, 2, 3, 4, 5, 5, 6, 7])
+
+    >>> cupy.pad(a, (2, 3), 'wrap')
+    array([4, 5, 1, 2, 3, 4, 5, 1, 2, 3])
+
+    >>> def pad_with(vector, pad_width, iaxis, kwargs):
+    ...     pad_value = kwargs.get('padder', 10)
+    ...     vector[:pad_width[0]] = pad_value
+    ...     vector[-pad_width[1]:] = pad_value
+    >>> a = cupy.arange(6)
+    >>> a = a.reshape((2, 3))
+    >>> cupy.pad(a, 2, pad_with)
+    array([[10, 10, 10, 10, 10, 10, 10],
+           [10, 10, 10, 10, 10, 10, 10],
+           [10, 10,  0,  1,  2, 10, 10],
+           [10, 10,  3,  4,  5, 10, 10],
+           [10, 10, 10, 10, 10, 10, 10],
+           [10, 10, 10, 10, 10, 10, 10]])
+    >>> cupy.pad(a, 2, pad_with, padder=100)
+    array([[100, 100, 100, 100, 100, 100, 100],
+           [100, 100, 100, 100, 100, 100, 100],
+           [100, 100,   0,   1,   2, 100, 100],
+           [100, 100,   3,   4,   5, 100, 100],
+           [100, 100, 100, 100, 100, 100, 100],
+           [100, 100, 100, 100, 100, 100, 100]])
+    """
+    if isinstance(pad_width, numbers.Integral):
+        pad_width = ((pad_width, pad_width),) * array.ndim
+    else:
+        pad_width = numpy.asarray(pad_width)
+
+        if not pad_width.dtype.kind == 'i':
+            raise TypeError('`pad_width` must be of integral type.')
+
+        # Broadcast to shape (array.ndim, 2)
+        pad_width = _as_pairs(pad_width, array.ndim, as_index=True)
+
+    if callable(mode):
+        # Old behavior: Use user-supplied function with numpy.apply_along_axis
+        function = mode
+        # Create a new zero padded array
+        padded, _ = _pad_simple(array, pad_width, fill_value=0)
+        # And apply along each axis
+
+        for axis in range(padded.ndim):
+            # Iterate using ndindex as in apply_along_axis, but assuming that
+            # function operates inplace on the padded array.
+
+            # view with the iteration axis at the end
+            view = cupy.moveaxis(padded, axis, -1)
+
+            # compute indices for the iteration axes, and append a trailing
+            # ellipsis to prevent 0d arrays decaying to scalars (gh-8642)
+            inds = numpy.ndindex(view.shape[:-1])
+            inds = (ind + (Ellipsis,) for ind in inds)
+            for ind in inds:
+                function(view[ind], pad_width[axis], axis, kwargs)
+
+        return padded
+
+    # Make sure that no unsupported keywords were passed for the current mode
+    allowed_kwargs = {
+        'empty': [],
+        'edge': [],
+        'wrap': [],
+        'constant': ['constant_values'],
+        'linear_ramp': ['end_values'],
+        'maximum': ['stat_length'],
+        'mean': ['stat_length'],
+        # 'median': ['stat_length'],
+        'minimum': ['stat_length'],
+        'reflect': ['reflect_type'],
+        'symmetric': ['reflect_type'],
+    }
+    try:
+        unsupported_kwargs = set(kwargs) - set(allowed_kwargs[mode])
+    except KeyError:
+        raise ValueError("mode '{}' is not supported".format(mode))
+    if unsupported_kwargs:
+        raise ValueError(
+            "unsupported keyword arguments for mode '{}': {}".format(
+                mode, unsupported_kwargs
+            )
+        )
+
+    if _use_elementwise_kernel(array, mode, kwargs):
+        # import here to avoid circular import
+        from cucim.skimage._vendored.pad_elementwise import _get_pad_kernel
+
+        if mode == 'reflect' and min(array.shape) > 1:
+            mode = 'reflect_no_singleton_dim'
+
+        if not array.flags.forc:
+            # make non-contiguous input C-contiguous
+            array = cupy.ascontiguousarray(array)
+
+        # Allocate grown array
+        new_shape = tuple(
+            left + size + right
+            for size, (left, right) in zip(array.shape, pad_width)
+        )
+        order = 'F' if array.flags.fnc else 'C'  # Fortran and not also C-order
+        padded = cupy.empty(new_shape, dtype=array.dtype, order=order)
+
+        (int_type, np_type) = (('int', cupy.int32) if padded.size < (1 << 31)
+                               else ('ptrdiff_t', cupy.intp))
+        kern = _get_pad_kernel(
+            pad_starts=tuple(p[0] for p in pad_width),
+            mode=mode,
+            int_type=int_type,
+            order=order,
+        )
+        # pad_width must be C-contiguous
+        if mode == 'constant':
+            # `_use_elementwise_kernel` excludes cases with non-scalar cval
+            cval = float(kwargs.get('constant_values', 0))
+            kern(array, cval, padded, size=padded.size)
+        else:
+            kern(array, padded, size=padded.size)
+        return padded
+
+    if mode == 'constant':
+        values = kwargs.get('constant_values', 0)
+        if isinstance(values, numbers.Number) and values == 0 and (
+                array.ndim == 1 or array.size < 4e6):
+            # faster path for 1d arrays or small n-dimensional arrays
+            return _pad_simple(array, pad_width, 0)[0]
+
+    stat_functions = {
+        'maximum': cupy.max,
+        'minimum': cupy.min,
+        'mean': cupy.mean,
+        # 'median': cupy.median,
+    }
+
+    # Create array with final shape and original values
+    # (padded area is undefined)
+    padded, original_area_slice = _pad_simple(array, pad_width)
+    # And prepare iteration over all dimensions
+    # (zipping may be more readable than using enumerate)
+    axes = range(padded.ndim)
+
+    if mode == 'constant':
+        values = _as_pairs(values, padded.ndim)
+        for axis, width_pair, value_pair in zip(axes, pad_width, values):
+            roi = _view_roi(padded, original_area_slice, axis)
+            _set_pad_area(roi, axis, width_pair, value_pair)
+
+    elif mode == 'empty':
+        pass  # Do nothing as _pad_simple already returned the correct result
+
+    elif array.size == 0:
+        # Only modes 'constant' and 'empty' can extend empty axes, all other
+        # modes depend on `array` not being empty
+        # -> ensure every empty axis is only 'padded with 0'
+        for axis, width_pair in zip(axes, pad_width):
+            if array.shape[axis] == 0 and any(width_pair):
+                raise ValueError(
+                    "can't extend empty axis {} using modes other than "
+                    "'constant' or 'empty'".format(axis)
+                )
+        # passed, don't need to do anything more as _pad_simple already
+        # returned the correct result
+
+    elif mode == 'edge':
+        for axis, width_pair in zip(axes, pad_width):
+            roi = _view_roi(padded, original_area_slice, axis)
+            edge_pair = _get_edges(roi, axis, width_pair)
+            _set_pad_area(roi, axis, width_pair, edge_pair)
+
+    elif mode == 'linear_ramp':
+        end_values = kwargs.get('end_values', 0)
+        end_values = _as_pairs(end_values, padded.ndim)
+        for axis, width_pair, value_pair in zip(axes, pad_width, end_values):
+            roi = _view_roi(padded, original_area_slice, axis)
+            ramp_pair = _get_linear_ramps(roi, axis, width_pair, value_pair)
+            _set_pad_area(roi, axis, width_pair, ramp_pair)
+
+    elif mode in stat_functions:
+        func = stat_functions[mode]
+        length = kwargs.get('stat_length', None)
+        length = _as_pairs(length, padded.ndim, as_index=True)
+        for axis, width_pair, length_pair in zip(axes, pad_width, length):
+            roi = _view_roi(padded, original_area_slice, axis)
+            stat_pair = _get_stats(roi, axis, width_pair, length_pair, func)
+            _set_pad_area(roi, axis, width_pair, stat_pair)
+
+    elif mode in {'reflect', 'symmetric'}:
+        method = kwargs.get('reflect_type', 'even')
+        include_edge = True if mode == 'symmetric' else False
+        for axis, (left_index, right_index) in zip(axes, pad_width):
+            if array.shape[axis] == 1 and (left_index > 0 or right_index > 0):
+                # Extending singleton dimension for 'reflect' is legacy
+                # behavior; it really should raise an error.
+                edge_pair = _get_edges(padded, axis, (left_index, right_index))
+                _set_pad_area(
+                    padded, axis, (left_index, right_index), edge_pair
+                )
+                continue
+
+            roi = _view_roi(padded, original_area_slice, axis)
+            while left_index > 0 or right_index > 0:
+                # Iteratively pad until dimension is filled with reflected
+                # values. This is necessary if the pad area is larger than
+                # the length of the original values in the current dimension.
+                left_index, right_index = _set_reflect_both(
+                    roi, axis, (left_index, right_index), method, include_edge
+                )
+
+    elif mode == 'wrap':
+        for axis, (left_index, right_index) in zip(axes, pad_width):
+            roi = _view_roi(padded, original_area_slice, axis)
+            while left_index > 0 or right_index > 0:
+                # Iteratively pad until dimension is filled with wrapped
+                # values. This is necessary if the pad area is larger than
+                # the length of the original values in the current dimension.
+                left_index, right_index = _set_wrap_both(
+                    roi, axis, (left_index, right_index)
+                )
+
+    return padded
diff --git a/python/cucim/src/cucim/skimage/_vendored/pad_elementwise.py b/python/cucim/src/cucim/skimage/_vendored/pad_elementwise.py
new file mode 100644
index 000000000..9b58c2fe0
--- /dev/null
+++ b/python/cucim/src/cucim/skimage/_vendored/pad_elementwise.py
@@ -0,0 +1,230 @@
+import cupy
+
+
+def _pad_boundary_ops(mode, var_name, size, int_t="int", no_singleton=False):
+    T = 'int' if int_t == 'int' else 'long long'
+    min_func = 'min'
+    max_func = 'max'
+    if mode == 'constant':
+        ops = f'''
+        if (({var_name} < 0) || {var_name} >= {size}) {{
+            {var_name} = -1;
+        }}'''
+    elif mode == 'symmetric':
+        ops = f'''
+            if ({var_name} < 0) {{
+                {var_name} = - 1 -{var_name};
+            }}
+            {var_name} %= {size} * 2;
+            {var_name} = {min_func}({var_name}, 2 * {size} - 1 - {var_name});
+        '''
+    elif mode == 'reflect':
+        ops = f'''
+        if ({size} == 1) {{
+            {var_name} = 0;
+        }} else {{
+            if ({var_name} < 0) {{
+                {var_name} = -{var_name};
+            }}
+            if ({var_name} >= {size}) {{
+                {var_name} = 1 + ({var_name} - 1) % (({size} - 1) * 2);
+                {var_name} = {min_func}({var_name},
+                                        2 * {size} - 2 - {var_name});
+            }}
+        }}'''  # noqa
+    elif mode == 'reflect_no_singleton_dim':
+        # the same as reflect, but without the extra `{size} == 1` check
+        ops = f'''
+        if ({var_name} < 0) {{
+            {var_name} = -{var_name};
+        }}
+        if ({var_name} >= {size}) {{
+            {var_name} = 1 + ({var_name} - 1) % (({size} - 1) * 2);
+            {var_name} = {min_func}({var_name}, 2 * {size} - 2 - {var_name});
+        }}
+        '''
+    elif mode == 'edge':
+        ops = f'''
+        {var_name} = {min_func}(
+            {max_func}(static_cast<{T}>({var_name}), static_cast<{T}>(0)),
+            static_cast<{T}>({size} - 1));
+        '''
+    elif mode == 'wrap':
+        ops = f'''
+        {var_name} %= {size};
+        if ({var_name} < 0) {{
+            {var_name} += {size};
+        }}
+        '''
+    return ops + "\n"
+
+
+def _generate_size_vars(
+    ndim, arr_name='arr', size_prefix='size', int_type='int'
+):
+    """Store shape of a raw array into individual variables.
+
+    Examples
+    --------
+    >>> print(_generate_size_vars(3, 'arr', 'size', 'int'))
+    int size_0 = arr.shape()[0];
+    int size_1 = arr.shape()[1];
+    int size_2 = arr.shape()[2];
+    """
+    set_size_vars = [f'{int_type} {size_prefix}_{i} = {arr_name}.shape()[{i}];'
+                     for i in range(ndim)]
+    return '\n'.join(set_size_vars) + '\n'
+
+
+def _generate_stride_vars(
+    ndim, arr_name='arr', size_prefix='stride', int_type='int'
+):
+    """Store stride (in bytes) of a raw array into individual variables.
+
+    Examples
+    --------
+    >>> print(_generate_size_vars(3, 'arr', 'size', 'int'))
+    int stride_0 = arr.strides()[0];
+    int stride_1 = arr.strides()[1];
+    int stride_2 = arr.strides()[2];
+    """
+    set_size_vars = [
+        f'{int_type} {size_prefix}_{i} = {arr_name}.strides()[{i}];'
+        for i in range(ndim)
+    ]
+    return '\n'.join(set_size_vars) + '\n'
+
+
+def _generate_indices_ops(
+    ndim, size_prefix='size', int_type='int', index_prefix='ind', order='C',
+):
+    """Generate indices based existing variables.
+
+    Assumes variables f'{size_prefix}_{i}' has the size along axis, i.
+
+    Examples
+    --------
+    >>> print(_generate_indices_ops(3, 'size', 'int', 'ind', 'C'))
+    int _i = i;
+    int ind_2 = _i % size_2; _i /= size_2;
+    int ind_1 = _i % size_1; _i /= size_1;
+    int ind_0 = _i;
+    """
+    if order == 'C':
+        _range = range(ndim - 1, 0, -1)
+        idx_largest_stride = 0
+    elif order == 'F':
+        _range = range(ndim - 1)
+        idx_largest_stride = ndim - 1
+    else:
+        raise ValueError(f"Unknown order: {order}. Must be one of {'C', 'F'}.")
+    body = [f'{int_type} {index_prefix}_{j} = _i % {size_prefix}_{j}; _i /= {size_prefix}_{j};'  # noqa
+            for j in _range]
+    body = '\n'.join(body)
+    code = f'{int_type} _i = i;\n'
+    code += body + '\n'
+    code += f'{int_type} {index_prefix}_{idx_largest_stride} = _i;\n'
+    return code
+
+
+def _gen_raveled(ndim, stride_prefix='stride', index_prefix='i', order=None):
+    """Generate raveled index for c-ordered memory layout
+
+    For index_prefix='i', the indices are (i_0, i_1, ....)
+    For stride_prefix='stride', the stride is (stride_0, stride_1, ....)
+    """
+    return ' + '.join(
+        f'{stride_prefix}_{j} * {index_prefix}_{j}' for j in range(ndim)
+    )
+
+
+def _get_pad_kernel_code(pad_starts, int_type='int', mode='edge', order='C'):
+    # variables storing shape of the output array
+    ndim = len(pad_starts)
+    out_size_prefix = 'shape'
+    operation = _generate_size_vars(
+        ndim, arr_name='out', size_prefix=out_size_prefix, int_type=int_type
+    )
+
+    # variables storing shape of the input array
+    in_size_prefix = 'ishape'
+    in_stride_prefix = 'istride'
+    operation += _generate_size_vars(
+        ndim, arr_name='arr', size_prefix=in_size_prefix, int_type=int_type
+    )
+    operation += _generate_stride_vars(
+        ndim, arr_name='arr', size_prefix=in_stride_prefix, int_type=int_type
+    )
+
+    # unraveled indices into the output array
+    out_index_prefix = 'oi'
+    # Note: Regardless of actual memory layout, need order='C' here to match
+    #       the behavior of the index raveling used by ElementwiseKernel.
+    operation += _generate_indices_ops(
+        ndim, size_prefix=out_size_prefix, int_type=int_type,
+        index_prefix=out_index_prefix, order='C'
+    )
+
+    # compute unraveled indices into the input array
+    # (i_0, i_1, ...)
+    in_index_prefix = 'i'
+    operation += '\n'.join(
+        [f'{int_type} {in_index_prefix}_{j} = {out_index_prefix}_{j} - {pad_starts[j]};'  # noqa
+         for j in range(ndim)]
+    )
+    operation += '\n'
+    input_indices = tuple(f'{in_index_prefix}_{j}' for j in range(ndim))
+
+    # impose boundary condition
+
+    range_cond = " || ".join(
+        f"({coord} < 0) || ({coord} >= {in_size_prefix}_{j})"
+        for j, coord in enumerate(input_indices)
+    )
+    operation += f"bool range_cond = {range_cond};"
+    operation += "if (range_cond) {\n"
+    if mode == "constant":
+        for j, coord in enumerate(input_indices):
+            operation += _pad_boundary_ops(
+                mode, coord, f"{in_size_prefix}_{j}", int_type
+            )
+            operation += f"""
+                if ({coord} == -1) {{
+                    out[i] = static_cast<F>(cval);
+                    return;
+                }}
+            """
+    else:
+        for j, coord in enumerate(input_indices):
+            operation += _pad_boundary_ops(
+                mode, coord, f"{in_size_prefix}_{j}", int_type
+            )
+    operation += "}\n"
+
+    raveled_idx = _gen_raveled(
+        ndim, stride_prefix=in_stride_prefix, index_prefix=in_index_prefix,
+        order=order,
+    )
+    operation += f"""
+    // set output based on raveled index into the input array
+    const char* char_arr = reinterpret_cast<const char*>(&arr[0]);
+    out[i] = *reinterpret_cast<const F*>(char_arr + {raveled_idx});
+    """
+    return operation
+
+
+@cupy._util.memoize(for_each_device=True)
+def _get_pad_kernel(pad_starts, int_type='int', mode='edge', order='C'):
+    in_params = "raw F arr"
+    if mode == 'constant':
+        in_params += ", float64 cval"
+
+    kernel_name = f"pad_{len(pad_starts)}d_order{order}_{mode}"
+    if int_type != "int":
+        kernel_name += f"_{int_type.replace(' ', '_')}_idx"
+
+    return cupy.ElementwiseKernel(
+        in_params=in_params,
+        out_params="raw F out",
+        operation=_get_pad_kernel_code(pad_starts, int_type, mode, order),
+        name=kernel_name)
diff --git a/python/cucim/src/cucim/skimage/exposure/_adapthist.py b/python/cucim/src/cucim/skimage/exposure/_adapthist.py
index d9dbf10c2..4d42e2ae3 100644
--- a/python/cucim/src/cucim/skimage/exposure/_adapthist.py
+++ b/python/cucim/src/cucim/skimage/exposure/_adapthist.py
@@ -27,6 +27,7 @@
 from cucim.skimage.exposure.exposure import rescale_intensity
 
 from .._shared.utils import _supported_float_type
+from .._vendored import pad
 from ..color.adapt_rgb import adapt_rgb, hsv_value
 from ..util import img_as_uint
 
@@ -141,9 +142,11 @@ def _clahe(image, kernel_size, clip_limit, nbins):
     pad_end_per_dim = [(k - s % k) % k + math.ceil(k / 2.)
                        for k, s in zip(kernel_size, image.shape)]
 
-    image = cp.pad(image, [[p_i, p_f] for p_i, p_f in
-                           zip(pad_start_per_dim, pad_end_per_dim)],
-                   mode='reflect')
+    image = pad(
+        image,
+        [[p_i, p_f] for p_i, p_f in zip(pad_start_per_dim, pad_end_per_dim)],
+        mode='reflect',
+    )
 
     # determine gray value bins
     bin_size = 1 + NR_OF_GRAY // nbins
@@ -194,9 +197,9 @@ def _clahe(image, kernel_size, clip_limit, nbins):
     hist = hist.reshape(hist_block_assembled_shape[:ndim] + (-1,))
 
     # duplicate leading mappings in each dim
-    map_array = cp.pad(hist,
-                       [(1, 1) for _ in range(ndim)] + [(0, 0)],
-                       mode='edge')
+    map_array = pad(
+        hist, [(1, 1) for _ in range(ndim)] + [(0, 0)], mode='edge'
+    )
 
     # Perform multilinear interpolation of graylevel mappings
     # using the convention described here:
diff --git a/python/cucim/src/cucim/skimage/feature/template.py b/python/cucim/src/cucim/skimage/feature/template.py
index ce3443c27..3eca3e266 100644
--- a/python/cucim/src/cucim/skimage/feature/template.py
+++ b/python/cucim/src/cucim/skimage/feature/template.py
@@ -3,6 +3,7 @@
 from cucim import _misc
 
 from .._shared.utils import _supported_float_type, check_nD
+from .._vendored import pad
 
 # from cupyx.scipy import signal
 
@@ -141,10 +142,10 @@ def match_template(image, template, pad_input=False, mode='constant',
 
     pad_width = tuple((width, width) for width in template.shape)
     if mode == 'constant':
-        image = cp.pad(image, pad_width=pad_width, mode=mode,
-                       constant_values=constant_values)
+        image = pad(image, pad_width=pad_width, mode=mode,
+                    constant_values=constant_values)
     else:
-        image = cp.pad(image, pad_width=pad_width, mode=mode)
+        image = pad(image, pad_width=pad_width, mode=mode)
 
     # Use special case for 2-D images for much better performance in
     # computation of integral images
diff --git a/python/cucim/src/cucim/skimage/feature/tests/test_blob.py b/python/cucim/src/cucim/skimage/feature/tests/test_blob.py
index 09346f6ad..91683111f 100644
--- a/python/cucim/src/cucim/skimage/feature/tests/test_blob.py
+++ b/python/cucim/src/cucim/skimage/feature/tests/test_blob.py
@@ -6,6 +6,7 @@
 from skimage.draw.draw3d import ellipsoid
 
 from cucim.skimage import feature
+from cucim.skimage._vendored import pad
 from cucim.skimage.feature import blob_dog, blob_doh, blob_log
 
 
@@ -77,9 +78,9 @@ def radius(x):
 def test_blob_dog_3d(dtype, threshold_type):
     # Testing 3D
     r = 10
-    pad = 10
+    pad_width = 10
     im3 = cp.asarray(ellipsoid(r, r, r))
-    im3 = cp.pad(im3, pad, mode='constant')
+    im3 = pad(im3, pad_width, mode='constant')
 
     if threshold_type == 'absolute':
         threshold = 0.001
@@ -99,9 +100,9 @@ def test_blob_dog_3d(dtype, threshold_type):
     b = blobs[0]
 
     assert b.shape == (4,)
-    assert b[0] == r + pad + 1
-    assert b[1] == r + pad + 1
-    assert b[2] == r + pad + 1
+    assert b[0] == r + pad_width + 1
+    assert b[1] == r + pad_width + 1
+    assert b[2] == r + pad_width + 1
     assert abs(math.sqrt(3) * b[3] - r) < 1.1
 
 
@@ -112,9 +113,9 @@ def test_blob_dog_3d(dtype, threshold_type):
 def test_blob_dog_3d_anisotropic(dtype, threshold_type):
     # Testing 3D anisotropic
     r = 10
-    pad = 10
+    pad_width = 10
     im3 = cp.asarray(ellipsoid(r / 2, r, r))
-    im3 = cp.pad(im3, pad, mode='constant')
+    im3 = pad(im3, pad_width, mode='constant')
 
     if threshold_type == 'absolute':
         threshold = 0.001
@@ -134,9 +135,9 @@ def test_blob_dog_3d_anisotropic(dtype, threshold_type):
     b = blobs[0]
 
     assert b.shape == (6,)
-    assert b[0] == r / 2 + pad + 1
-    assert b[1] == r + pad + 1
-    assert b[2] == r + pad + 1
+    assert b[0] == r / 2 + pad_width + 1
+    assert b[1] == r + pad_width + 1
+    assert b[2] == r + pad_width + 1
     assert abs(math.sqrt(3) * b[3] - r / 2) < 1.1
     assert abs(math.sqrt(3) * b[4] - r) < 1.1
     assert abs(math.sqrt(3) * b[5] - r) < 1.1
@@ -294,26 +295,26 @@ def test_blob_log_no_warnings():
 def test_blob_log_3d():
     # Testing 3D
     r = 6
-    pad = 10
+    pad_width = 10
     im3 = cp.asarray(ellipsoid(r, r, r))
-    im3 = cp.pad(im3, pad, mode='constant')
+    im3 = pad(im3, pad_width, mode='constant')
 
     blobs = blob_log(im3, min_sigma=3, max_sigma=10)
     b = blobs[0]
 
     assert b.shape == (4,)
-    assert b[0] == r + pad + 1
-    assert b[1] == r + pad + 1
-    assert b[2] == r + pad + 1
+    assert b[0] == r + pad_width + 1
+    assert b[1] == r + pad_width + 1
+    assert b[2] == r + pad_width + 1
     assert abs(math.sqrt(3) * b[3] - r) < 1
 
 
 def test_blob_log_3d_anisotropic():
     # Testing 3D anisotropic
     r = 6
-    pad = 10
+    pad_width = 10
     im3 = cp.asarray(ellipsoid(r / 2, r, r))
-    im3 = cp.pad(im3, pad, mode='constant')
+    im3 = pad(im3, pad_width, mode='constant')
 
     blobs = blob_log(
         im3,
@@ -323,9 +324,9 @@ def test_blob_log_3d_anisotropic():
 
     b = blobs[0]
     assert b.shape == (6,)
-    assert b[0] == r / 2 + pad + 1
-    assert b[1] == r + pad + 1
-    assert b[2] == r + pad + 1
+    assert b[0] == r / 2 + pad_width + 1
+    assert b[1] == r + pad_width + 1
+    assert b[2] == r + pad_width + 1
     assert abs(math.sqrt(3) * b[3] - r / 2) < 1
     assert abs(math.sqrt(3) * b[4] - r) < 1
     assert abs(math.sqrt(3) * b[5] - r) < 1
@@ -501,11 +502,10 @@ def test_blob_log_overlap_3d():
     r1, r2 = 7, 6
     pad1, pad2 = 11, 12
     blob1 = cp.asarray(ellipsoid(r1, r1, r1))
-    blob1 = cp.pad(blob1, pad1, mode='constant')
+    blob1 = pad(blob1, pad1, mode='constant')
     blob2 = cp.asarray(ellipsoid(r2, r2, r2))
-    blob2 = cp.pad(blob2, [(pad2, pad2), (pad2 - 9, pad2 + 9),
-                           (pad2, pad2)],
-                   mode='constant')
+    blob2 = pad(blob2, [(pad2, pad2), (pad2 - 9, pad2 + 9), (pad2, pad2)],
+                mode='constant')
     im3 = cp.logical_or(blob1, blob2)
 
     blobs = blob_log(im3, min_sigma=2, max_sigma=10, overlap=0.1)
diff --git a/python/cucim/src/cucim/skimage/feature/tests/test_corner.py b/python/cucim/src/cucim/skimage/feature/tests/test_corner.py
index f2437b0d7..dc752ecd7 100644
--- a/python/cucim/src/cucim/skimage/feature/tests/test_corner.py
+++ b/python/cucim/src/cucim/skimage/feature/tests/test_corner.py
@@ -9,6 +9,7 @@
 from cucim.skimage import img_as_float
 from cucim.skimage._shared._warnings import expected_warnings
 from cucim.skimage._shared.utils import _supported_float_type
+from cucim.skimage._vendored import pad
 from cucim.skimage.color import rgb2gray
 from cucim.skimage.feature import (corner_foerstner, corner_harris,
                                    corner_kitchen_rosenfeld, corner_peaks,
@@ -24,9 +25,9 @@
 @pytest.fixture
 def im3d():
     r = 10
-    pad = 10
+    pad_width = 10
     im3 = draw.ellipsoid(r, r, r)
-    im3 = np.pad(im3, pad, mode='constant').astype(np.uint8)
+    im3 = np.pad(im3, pad_width, mode='constant').astype(np.uint8)
     return cp.asarray(im3)
 
 
@@ -226,9 +227,9 @@ def test_structure_tensor_eigenvalues(dtype):
 
 
 def test_structure_tensor_eigenvalues_3d():
-    image = cp.pad(cube(9), 5, mode='constant') * 1000
-    boundary = (cp.pad(cube(9), 5, mode='constant')
-                - cp.pad(cube(7), 6, mode='constant')).astype(bool)
+    image = pad(cube(9), 5, mode='constant') * 1000
+    boundary = (pad(cube(9), 5, mode='constant')
+                - pad(cube(7), 6, mode='constant')).astype(bool)
     A_elems = structure_tensor(image, sigma=0.1)
     e0, e1, e2 = structure_tensor_eigenvalues(A_elems)
     # e0 should detect facets
diff --git a/python/cucim/src/cucim/skimage/filters/_fft_based.py b/python/cucim/src/cucim/skimage/filters/_fft_based.py
index d54777610..5e1b76ba0 100644
--- a/python/cucim/src/cucim/skimage/filters/_fft_based.py
+++ b/python/cucim/src/cucim/skimage/filters/_fft_based.py
@@ -5,6 +5,7 @@
 import numpy as np
 
 from .._shared.utils import _supported_float_type
+from .._vendored import pad
 
 
 def _get_nd_butterworth_filter(shape, factor, order, high_pass, real,
@@ -157,7 +158,7 @@ def butterworth(
         raise ValueError("npad must be >= 0")
     elif npad > 0:
         center_slice = tuple(slice(npad, s + npad) for s in image.shape)
-        image = cp.pad(image, npad, mode='edge')
+        image = pad(image, npad, mode='edge')
     fft_shape = (image.shape if channel_axis is None
                  else np.delete(image.shape, channel_axis))
     is_real = cp.isrealobj(image)
diff --git a/python/cucim/src/cucim/skimage/filters/_median_hist.py b/python/cucim/src/cucim/skimage/filters/_median_hist.py
index 03e10c688..1dfbd8d09 100644
--- a/python/cucim/src/cucim/skimage/filters/_median_hist.py
+++ b/python/cucim/src/cucim/skimage/filters/_median_hist.py
@@ -7,6 +7,7 @@
 import numpy as np
 
 from .._shared.utils import _to_np_mode
+from .._vendored import pad
 
 if hasattr(math, 'prod'):
     prod = math.prod
@@ -495,7 +496,7 @@ def _median_hist(image, footprint, output=None, mode='mirror', cval=0,
             pad_kwargs = dict(mode=mode, constant_values=cval)
         else:
             pad_kwargs = dict(mode=mode)
-        image = cp.pad(image, npad, **pad_kwargs)
+        image = pad(image, npad, **pad_kwargs)
         # must update n_rows, n_cols after padding!
         n_rows, n_cols = image.shape[:2]
 
diff --git a/python/cucim/src/cucim/skimage/filters/_sparse.py b/python/cucim/src/cucim/skimage/filters/_sparse.py
index 6c893d4d9..d202ce3e9 100644
--- a/python/cucim/src/cucim/skimage/filters/_sparse.py
+++ b/python/cucim/src/cucim/skimage/filters/_sparse.py
@@ -2,6 +2,7 @@
 import numpy as np
 
 from .._shared.utils import _supported_float_type, _to_np_mode
+from .._vendored import pad
 
 
 def _validate_window_size(axis_sizes):
@@ -118,7 +119,7 @@ def correlate_sparse(image, kernel, mode='reflect'):
     else:
         np_mode = _to_np_mode(mode)
         _validate_window_size(kernel.shape)
-        padded_image = cp.pad(
+        padded_image = pad(
             image,
             [(w // 2, w // 2) for w in kernel.shape],
             mode=np_mode,
diff --git a/python/cucim/src/cucim/skimage/filters/tests/test_gaussian.py b/python/cucim/src/cucim/skimage/filters/tests/test_gaussian.py
index 3f79d774b..15cc7d1c3 100644
--- a/python/cucim/src/cucim/skimage/filters/tests/test_gaussian.py
+++ b/python/cucim/src/cucim/skimage/filters/tests/test_gaussian.py
@@ -3,6 +3,7 @@
 import pytest
 
 from cucim.skimage._shared._warnings import expected_warnings
+from cucim.skimage._vendored import pad
 from cucim.skimage.filters._gaussian import difference_of_gaussians, gaussian
 
 
@@ -182,7 +183,7 @@ def test_shared_mem_check_fix(dtype, sigma):
 def test_deprecated_automatic_channel_detection():
     rgb = cp.zeros((5, 5, 3))
     rgb[1, 1] = cp.arange(1, 4)
-    gray = cp.pad(rgb, pad_width=((0, 0), (0, 0), (1, 0)))
+    gray = pad(rgb, pad_width=((0, 0), (0, 0), (1, 0)))
 
     # Warning is raised if channel_axis is not set and shape is (M, N, 3)
     with pytest.warns(
diff --git a/python/cucim/src/cucim/skimage/filters/thresholding.py b/python/cucim/src/cucim/skimage/filters/thresholding.py
index e0ffc803c..367eb35d8 100644
--- a/python/cucim/src/cucim/skimage/filters/thresholding.py
+++ b/python/cucim/src/cucim/skimage/filters/thresholding.py
@@ -18,6 +18,7 @@
 from .._shared.filters import gaussian
 from .._shared.utils import _supported_float_type, deprecate_kwarg, warn
 from .._shared.version_requirements import require
+from .._vendored import pad
 from ..exposure import histogram
 from ..transform import integral_image
 from ..util import dtype_limits
@@ -955,8 +956,8 @@ def _mean_std(image, w):
 
     float_dtype = _supported_float_type(image.dtype)
     pad_width = tuple((k // 2 + 1, k // 2) for k in w)
-    padded = cp.pad(image.astype(float_dtype, copy=False), pad_width,
-                    mode='reflect')
+    padded = pad(image.astype(float_dtype, copy=False), pad_width,
+                 mode='reflect')
 
     # Note: keep float64 integral images for accuracy. Outputs of
     # _correlate_sparse can later be safely cast to float_dtype
diff --git a/python/cucim/src/cucim/skimage/measure/_regionprops.py b/python/cucim/src/cucim/skimage/measure/_regionprops.py
index 941b14c32..daf9de458 100644
--- a/python/cucim/src/cucim/skimage/measure/_regionprops.py
+++ b/python/cucim/src/cucim/skimage/measure/_regionprops.py
@@ -9,6 +9,8 @@
 from cupyx.scipy import ndimage as ndi
 from scipy.ndimage import find_objects as cpu_find_objects
 
+from cucim.skimage._vendored import pad
+
 from . import _moments
 from ._regionprops_utils import euler_number, perimeter, perimeter_crofton
 
@@ -479,7 +481,7 @@ def feret_diameter_max(self):
 
         # TODO: implement marching cubes, etc.
         warn("feret diameter_max currently not implemented on GPU.")
-        identity_convex_hull = cp.pad(
+        identity_convex_hull = pad(
             self.image_convex, 2, mode="constant", constant_values=0
         )
         identity_convex_hull = cp.asnumpy(identity_convex_hull)
diff --git a/python/cucim/src/cucim/skimage/measure/_regionprops_utils.py b/python/cucim/src/cucim/skimage/measure/_regionprops_utils.py
index 358d6f7da..54812a11b 100644
--- a/python/cucim/src/cucim/skimage/measure/_regionprops_utils.py
+++ b/python/cucim/src/cucim/skimage/measure/_regionprops_utils.py
@@ -4,6 +4,8 @@
 import cupyx.scipy.ndimage as ndi
 import numpy as np
 
+from cucim.skimage._vendored import pad
+
 from .._shared.utils import deprecate_kwarg
 
 # Don't allocate STREL_* on GPU as we don't know in advance which device
@@ -146,7 +148,7 @@ def euler_number(image, connectivity=None):
 
     # as image can be a label image, transform it to binary
     image = (image > 0).astype(int)
-    image = cp.pad(image, pad_width=1, mode='constant')
+    image = pad(image, pad_width=1, mode='constant')
 
     # check connectivity
     if connectivity is None:
@@ -323,7 +325,7 @@ def perimeter_crofton(image, directions=4):
 
     # as image could be a label image, transform it to binary image
     image = (image > 0).astype(cp.uint8)
-    image = cp.pad(image, pad_width=1, mode="constant")
+    image = pad(image, pad_width=1, mode="constant")
     XF = ndi.convolve(image, cp.array([[0, 0, 0], [0, 1, 4], [0, 2, 8]]),
                       mode='constant', cval=0)
 
diff --git a/python/cucim/src/cucim/skimage/measure/block.py b/python/cucim/src/cucim/skimage/measure/block.py
index f21c543c9..3db2ea742 100644
--- a/python/cucim/src/cucim/skimage/measure/block.py
+++ b/python/cucim/src/cucim/skimage/measure/block.py
@@ -1,6 +1,7 @@
 import cupy as cp
 import numpy as np
 
+from .._vendored import pad
 from ..util import view_as_blocks
 
 
@@ -84,8 +85,8 @@ def block_reduce(image, block_size=2, func=cp.sum, cval=0, func_kwargs=None):
             after_width = 0
         pad_width.append((0, after_width))
 
-    image = cp.pad(image, pad_width=pad_width, mode='constant',
-                   constant_values=cval)
+    image = pad(image, pad_width=pad_width, mode='constant',
+                constant_values=cval)
 
     blocked = view_as_blocks(image, block_size)
 
diff --git a/python/cucim/src/cucim/skimage/measure/tests/test_regionprops.py b/python/cucim/src/cucim/skimage/measure/tests/test_regionprops.py
index f6be66c4a..67763b8eb 100644
--- a/python/cucim/src/cucim/skimage/measure/tests/test_regionprops.py
+++ b/python/cucim/src/cucim/skimage/measure/tests/test_regionprops.py
@@ -11,6 +11,7 @@
 
 from cucim.skimage import transform
 from cucim.skimage._shared._warnings import expected_warnings
+from cucim.skimage._vendored import pad
 from cucim.skimage.measure import (euler_number, perimeter, perimeter_crofton,
                                    regionprops, regionprops_table)
 from cucim.skimage.measure._regionprops import \
@@ -239,7 +240,7 @@ def test_bbox():
 
 
 def test_area_bbox():
-    padded = cp.pad(SAMPLE, 5, mode='constant')
+    padded = pad(SAMPLE, 5, mode='constant')
     bbox_area = regionprops(padded)[0].area_bbox
     assert_array_almost_equal(bbox_area, SAMPLE.size)
 
@@ -391,7 +392,7 @@ def test_coordinates_scaled():
 
 
 def test_slice():
-    padded = cp.pad(SAMPLE, ((2, 4), (5, 2)), mode="constant")
+    padded = pad(SAMPLE, ((2, 4), (5, 2)), mode="constant")
     nrow, ncol = SAMPLE.shape
     result = regionprops(padded)[0].slice
     expected = (slice(2, 2 + nrow), slice(5, 5 + ncol))
diff --git a/python/cucim/src/cucim/skimage/metrics/simple_metrics.py b/python/cucim/src/cucim/skimage/metrics/simple_metrics.py
index 1b29e602f..3e1a7bdbb 100644
--- a/python/cucim/src/cucim/skimage/metrics/simple_metrics.py
+++ b/python/cucim/src/cucim/skimage/metrics/simple_metrics.py
@@ -2,6 +2,7 @@
 from cupyx.scipy.stats import entropy
 
 from .._shared.utils import _supported_float_type, check_shape_equality, warn
+from .._vendored import pad
 from ..util.dtype import dtype_range
 
 __all__ = ['mean_squared_error',
@@ -188,7 +189,7 @@ def _pad_to(arr, shape):
         raise ValueError(f'Target shape {shape} cannot be smaller than input'
                          f'shape {arr.shape} along any axis.')
     padding = [(0, s - i) for s, i in zip(shape, arr.shape)]
-    return cp.pad(arr, pad_width=padding, mode='constant', constant_values=0)
+    return pad(arr, pad_width=padding, mode='constant', constant_values=0)
 
 
 def normalized_mutual_information(image0, image1, *, bins=100):
diff --git a/python/cucim/src/cucim/skimage/morphology/gray.py b/python/cucim/src/cucim/skimage/morphology/gray.py
index f900c1607..48f87a40e 100644
--- a/python/cucim/src/cucim/skimage/morphology/gray.py
+++ b/python/cucim/src/cucim/skimage/morphology/gray.py
@@ -8,6 +8,7 @@
 import cucim.skimage._vendored.ndimage as ndi
 
 from .._shared.utils import deprecate_kwarg
+from .._vendored import pad
 from ..util import crop
 from .footprints import _footprint_is_sequence, _shape_from_sequence
 from .misc import default_footprint
@@ -144,7 +145,7 @@ def func_out(image, footprint, out=None, *args, **kwargs):
                 axis_pad_width = 0
             pad_widths.append((axis_pad_width,) * 2)
         if padding:
-            image = cp.pad(image, pad_widths, mode='edge')
+            image = pad(image, pad_widths, mode='edge')
             out_temp = cp.empty_like(image)
         else:
             out_temp = out
diff --git a/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py b/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py
index e63521d6b..bab2f1887 100644
--- a/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py
+++ b/python/cucim/src/cucim/skimage/segmentation/_chan_vese.py
@@ -5,6 +5,7 @@
 from cucim.core.operations.morphology import distance_transform_edt
 
 from .._shared.utils import _supported_float_type, deprecate_kwarg
+from .._vendored import pad
 
 
 @cp.fuse()
@@ -23,7 +24,7 @@ def _fused_curvature(phi, x_start, x_end, y_start, y_end, ul, ur, ll, lr):
 def _cv_curvature(phi):
     """Returns the 'curvature' of a level set 'phi'.
     """
-    P = cp.pad(phi, 1, mode='edge')
+    P = pad(phi, 1, mode='edge')
     y_start = P[:-2, 1:-1]
     y_end = P[2:, 1:-1]
     x_start = P[1:-1, :-2]
@@ -103,7 +104,7 @@ def _cv_calculate_variation(image, phi, mu, lambda1, lambda2, dt):
     """Returns the variation of level set 'phi' based on algorithm parameters.
     """
     eta = 1e-16
-    P = cp.pad(phi, 1, mode='edge')
+    P = pad(phi, 1, mode='edge')
 
     x_end = P[1:-1, 2:]
     x_mid = P[1:-1, 1:-1]
diff --git a/python/cucim/src/cucim/skimage/transform/_warps.py b/python/cucim/src/cucim/skimage/transform/_warps.py
index 995961436..dfc617152 100644
--- a/python/cucim/src/cucim/skimage/transform/_warps.py
+++ b/python/cucim/src/cucim/skimage/transform/_warps.py
@@ -8,6 +8,7 @@
 from .._shared.utils import (_to_ndimage_mode, _validate_interpolation_order,
                              channel_as_last_axis, convert_to_float,
                              safe_as_int, warn)
+from .._vendored import pad
 from ..measure import block_reduce
 from ._geometric import (AffineTransform, ProjectiveTransform,
                          SimilarityTransform)
@@ -1203,14 +1204,14 @@ def _local_mean_weights(old_size, new_size, grid_mode, dtype):
         new_breaks = cp.linspace(0, old_size, num=new_size + 1, dtype=dtype)
     else:
         old, new = old_size - 1, new_size - 1
-        old_breaks = cp.pad(cp.linspace(0.5, old - 0.5, old, dtype=dtype),
-                            1, 'constant', constant_values=(0, old))
+        old_breaks = pad(cp.linspace(0.5, old - 0.5, old, dtype=dtype),
+                         1, 'constant', constant_values=(0, old))
         if new == 0:
             val = np.inf
         else:
             val = 0.5 * old / new
-        new_breaks = cp.pad(cp.linspace(val, old - val, new, dtype=dtype),
-                            1, 'constant', constant_values=(0, old))
+        new_breaks = pad(cp.linspace(val, old - val, new, dtype=dtype),
+                         1, 'constant', constant_values=(0, old))
 
     upper = cp.minimum(new_breaks[1:, np.newaxis], old_breaks[np.newaxis, 1:])
     lower = cp.maximum(new_breaks[:-1, np.newaxis],