diff --git a/dpctl/dptensor/numpy_usm_shared.py b/dpctl/dptensor/numpy_usm_shared.py index a2bd452350..2c790bc1dd 100644 --- a/dpctl/dptensor/numpy_usm_shared.py +++ b/dpctl/dptensor/numpy_usm_shared.py @@ -1,4 +1,4 @@ -##===---------- dparray.py - dpctl -------*- Python -*----===## +##===---------- numpy_usm_shared.py - dpctl -------*- Python -*----===## ## ## Data Parallel Control (dpCtl) ## @@ -19,7 +19,7 @@ ##===----------------------------------------------------------------------===## ### ### \file -### This file implements a dparray - USM aware implementation of ndarray. +### This file implements a numpy_usm_shared - USM aware implementation of ndarray. ##===----------------------------------------------------------------------===## import numpy as np @@ -70,12 +70,17 @@ class ndarray(np.ndarray): with a foreign allocator. """ + external_usm_checkers = [] + + def add_external_usm_checker(func): + ndarray.external_usm_checkers.append(func) + def __new__( subtype, shape, dtype=float, buffer=None, offset=0, strides=None, order=None ): # Create a new array. if buffer is None: - dprint("dparray::ndarray __new__ buffer None") + dprint("numpy_usm_shared::ndarray __new__ buffer None") nelems = np.prod(shape) dt = np.dtype(dtype) isz = dt.itemsize @@ -102,7 +107,7 @@ def __new__( return new_obj # zero copy if buffer is a usm backed array-like thing elif hasattr(buffer, array_interface_property): - dprint("dparray::ndarray __new__ buffer", array_interface_property) + dprint("numpy_usm_shared::ndarray __new__ buffer", array_interface_property) # also check for array interface new_obj = np.ndarray.__new__( subtype, @@ -124,7 +129,7 @@ def __new__( ) return new_obj else: - dprint("dparray::ndarray __new__ buffer not None and not sycl_usm") + dprint("numpy_usm_shared::ndarray __new__ buffer not None and not sycl_usm") nelems = np.prod(shape) # must copy ar = np.ndarray( @@ -158,6 +163,9 @@ def __new__( ) return new_obj + def __sycl_usm_array_interface__(self): + return self._getter_sycl_usm_array_interface() + def _getter_sycl_usm_array_interface_(self): ary_iface = self.__array_interface__ _base = _get_usm_base(self) @@ -186,6 +194,9 @@ def __array_finalize__(self, obj): # subclass of ndarray, including our own. if hasattr(obj, array_interface_property): return + for ext_checker in ndarray.external_usm_checkers: + if ext_checker(obj): + return if isinstance(obj, np.ndarray): ob = self while isinstance(ob, np.ndarray): @@ -200,7 +211,7 @@ def __array_finalize__(self, obj): ) # Tell Numba to not treat this type just like a NumPy ndarray but to propagate its type. - # This way it will use the custom dparray allocator. + # This way it will use the custom numpy_usm_shared allocator. __numba_no_subtype_ndarray__ = True # Convert to a NumPy ndarray. @@ -234,8 +245,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): else: return NotImplemented # Have to avoid recursive calls to array_ufunc here. - # If no out kwarg then we create a dparray out so that we get - # USM memory. However, if kwarg has dparray-typed out then + # If no out kwarg then we create a numpy_usm_shared out so that we get + # USM memory. However, if kwarg has numpy_usm_shared-typed out then # array_ufunc is called recursively so we cast out as regular # NumPy ndarray (having a USM data pointer). if kwargs.get("out", None) is None: @@ -246,7 +257,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): out_as_np = np.ndarray(out.shape, out.dtype, out) kwargs["out"] = out_as_np else: - # If they manually gave dparray as out kwarg then we have to also + # If they manually gave numpy_usm_shared as out kwarg then we have to also # cast as regular NumPy ndarray to avoid recursion. if isinstance(kwargs["out"], ndarray): out = kwargs["out"] @@ -271,7 +282,7 @@ def isdef(x): cname = c[0] if isdef(cname): continue - # For now we do the simple thing and copy the types from NumPy module into dparray module. + # For now we do the simple thing and copy the types from NumPy module into numpy_usm_shared module. new_func = "%s = np.%s" % (cname, cname) try: the_code = compile(new_func, "__init__", "exec") diff --git a/examples/cython/sycl_direct_linkage/README.md b/examples/cython/sycl_direct_linkage/README.md new file mode 100644 index 0000000000..fdd3d2a6e1 --- /dev/null +++ b/examples/cython/sycl_direct_linkage/README.md @@ -0,0 +1,57 @@ +# Example "sycl_direct_linkage" + +This Cython extension does not use dpCtl and links to SYCL directly. + +It exposes `columnwise_total` function that uses oneMKL to compute +totals for each column of its argument matrix in double precision, +expected as an ordinary NumPy array in C-contiguous layout. + +This functions performs the following steps: + + 1. Create a SYCL queue using default device selector + 2. Creates SYCL buffer around the matrix data + 3. Creates a vector `v_ones` with all elements being ones, + and allocates memory for the result. + 4. Calls oneMKL to compute xGEMV, as dot(v_ones, M) + 5. Returs the result as NumPy array + +This extension does not allow one to control the device/queue to +which execution of kernel is being schedules. + +A related example "sycl_buffer" modifies this example in that it uses +`dpCtl` to retrieve the current queue, allowing a user control the queue, +and the avoid the overhead of the queue creation. + +To illustrate the queue creation overhead in each call, compare execution of default queue, +which is Intel Gen9 GPU on OpenCL backend: + +``` +(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py +========== Executing warm-up ========== +NumPy result: [1. 1. 1. ... 1. 1. 1.] +SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.] +Running time of 100 calls to columnwise_total on matrix with shape (10000, 4098) +Times for default_selector, inclusive of queue creation: +[19.384219504892826, 19.49932464491576, 19.613155928440392, 19.64031868893653, 19.752969074994326] +Times for NumPy +[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887] +``` + +vs. timing when `dpctl`'s current queue is being reused: + +``` +(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py +========== Executing warm-up ========== +NumPy result: [1. 1. 1. ... 1. 1. 1.] +SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.] +SYCL(Intel(R) Graphics Gen9 [0x9bca]) result: [1. 1. 1. ... 1. 1. 1.] +Times for 'opencl:cpu:0' +[2.9164800881408155, 2.8714500251226127, 2.9770236839540303, 2.913622073829174, 2.7949972581118345] +Times for 'opencl:gpu:0' +[9.529508924111724, 10.288004886358976, 10.189113245811313, 10.197128206957132, 10.26169267296791] +Times for NumPy +[3.4809365631081164, 3.42917942116037, 3.42471009073779, 3.3689011191017926, 3.4336009239777923] +``` + +So the overhead of ``sycl::queue`` creation per call is roughly comparable with the time to +execute the actual computation. \ No newline at end of file diff --git a/examples/cython/sycl_direct_linkage/bench.py b/examples/cython/sycl_direct_linkage/bench.py new file mode 100644 index 0000000000..254bf665b9 --- /dev/null +++ b/examples/cython/sycl_direct_linkage/bench.py @@ -0,0 +1,36 @@ +import dpctl +import syclbuffer_naive as sb +import numpy as np + +X = np.full((10 ** 4, 4098), 1e-4, dtype="d") + +# warm-up +print("=" * 10 + " Executing warm-up " + "=" * 10) +print("NumPy result: ", X.sum(axis=0)) + +print( + "SYCL(default_device) result: {}".format( + sb.columnwise_total(X), + ) +) + +import timeit + +print( + "Running time of 100 calls to columnwise_total on matrix with shape {}".format( + X.shape + ) +) + +print("Times for default_selector, inclusive of queue creation:") +print( + timeit.repeat( + stmt="sb.columnwise_total(X)", + setup="sb.columnwise_total(X)", # ensure JIT compilation is not counted + number=100, + globals=globals(), + ) +) + +print("Times for NumPy") +print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals())) diff --git a/examples/cython/sycl_direct_linkage/run.py b/examples/cython/sycl_direct_linkage/run.py index ed9597add1..cc8358f762 100644 --- a/examples/cython/sycl_direct_linkage/run.py +++ b/examples/cython/sycl_direct_linkage/run.py @@ -1,4 +1,4 @@ -import syclbuffer as sb +import syclbuffer_naive as sb import numpy as np X = np.random.randn(20, 10) diff --git a/examples/cython/sycl_direct_linkage/sycl_function.cpp b/examples/cython/sycl_direct_linkage/sycl_function.cpp index d9d8065f3e..ad48580aaf 100644 --- a/examples/cython/sycl_direct_linkage/sycl_function.cpp +++ b/examples/cython/sycl_direct_linkage/sycl_function.cpp @@ -1,6 +1,6 @@ #include #include "sycl_function.hpp" -#include "mkl_blas_sycl.hpp" +#include #include "mkl.h" int c_columnwise_total(cl::sycl::queue &q, size_t n, size_t m, double *mat, double *ct) { diff --git a/examples/cython/usm_memory/blackscholes.pyx b/examples/cython/usm_memory/blackscholes.pyx index ab9b8da9f3..7d7867abe3 100644 --- a/examples/cython/usm_memory/blackscholes.pyx +++ b/examples/cython/usm_memory/blackscholes.pyx @@ -2,7 +2,7 @@ # distutils: language=c++ cimport dpctl as c_dpctl -cimport dpctl._memory as c_dpctl_mem +cimport dpctl.memory as c_dpctl_mem cimport numpy as cnp from cython cimport floating diff --git a/examples/cython/usm_memory/sycl_blackscholes.cpp b/examples/cython/usm_memory/sycl_blackscholes.cpp index 734aabdd1d..759e863fda 100644 --- a/examples/cython/usm_memory/sycl_blackscholes.cpp +++ b/examples/cython/usm_memory/sycl_blackscholes.cpp @@ -1,7 +1,8 @@ #include +#include +#include #include "dpctl_sycl_types.h" #include "sycl_blackscholes.hpp" -#include "mkl_rng_sycl_device.hpp" template class black_scholes_kernel;