Commit d157ee0: Merge branch 'master' into gold/2021

PokhodenkoSA committed Dec 15, 2020
2 parents 97cc81f + 0e8d4fb

Showing 7 changed files with 119 additions and 14 deletions.
31 changes: 21 additions & 10 deletions dpctl/dptensor/numpy_usm_shared.py
@@ -1,4 +1,4 @@
-##===---------- dparray.py - dpctl -------*- Python -*----===##
+##===---------- numpy_usm_shared.py - dpctl -------*- Python -*----===##
##
## Data Parallel Control (dpCtl)
##
@@ -19,7 +19,7 @@
##===----------------------------------------------------------------------===##
###
### \file
-### This file implements a dparray - USM aware implementation of ndarray.
+### This file implements a numpy_usm_shared - USM aware implementation of ndarray.
##===----------------------------------------------------------------------===##

import numpy as np
@@ -70,12 +70,17 @@ class ndarray(np.ndarray):
with a foreign allocator.
"""

+    external_usm_checkers = []
+
+    def add_external_usm_checker(func):
+        ndarray.external_usm_checkers.append(func)

def __new__(
subtype, shape, dtype=float, buffer=None, offset=0, strides=None, order=None
):
# Create a new array.
if buffer is None:
dprint("dparray::ndarray __new__ buffer None")
dprint("numpy_usm_shared::ndarray __new__ buffer None")
nelems = np.prod(shape)
dt = np.dtype(dtype)
isz = dt.itemsize
@@ -102,7 +107,7 @@ def __new__(
return new_obj
# zero copy if buffer is a usm backed array-like thing
elif hasattr(buffer, array_interface_property):
dprint("dparray::ndarray __new__ buffer", array_interface_property)
dprint("numpy_usm_shared::ndarray __new__ buffer", array_interface_property)
# also check for array interface
new_obj = np.ndarray.__new__(
subtype,
@@ -124,7 +129,7 @@
)
return new_obj
else:
dprint("dparray::ndarray __new__ buffer not None and not sycl_usm")
dprint("numpy_usm_shared::ndarray __new__ buffer not None and not sycl_usm")
nelems = np.prod(shape)
# must copy
ar = np.ndarray(
@@ -158,6 +163,9 @@ def __new__(
)
return new_obj

+    def __sycl_usm_array_interface__(self):
+        return self._getter_sycl_usm_array_interface_()

def _getter_sycl_usm_array_interface_(self):
ary_iface = self.__array_interface__
_base = _get_usm_base(self)
@@ -186,6 +194,9 @@ def __array_finalize__(self, obj):
# subclass of ndarray, including our own.
if hasattr(obj, array_interface_property):
return
+        for ext_checker in ndarray.external_usm_checkers:
+            if ext_checker(obj):
+                return
if isinstance(obj, np.ndarray):
ob = self
while isinstance(ob, np.ndarray):
@@ -200,7 +211,7 @@ def __array_finalize__(self, obj):
)

# Tell Numba to not treat this type just like a NumPy ndarray but to propagate its type.
-# This way it will use the custom dparray allocator.
+# This way it will use the custom numpy_usm_shared allocator.
__numba_no_subtype_ndarray__ = True

# Convert to a NumPy ndarray.
@@ -234,8 +245,8 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
else:
return NotImplemented
# Have to avoid recursive calls to array_ufunc here.
-# If no out kwarg then we create a dparray out so that we get
-# USM memory. However, if kwarg has dparray-typed out then
+# If no out kwarg then we create a numpy_usm_shared out so that we get
+# USM memory. However, if kwarg has numpy_usm_shared-typed out then
# array_ufunc is called recursively so we cast out as regular
# NumPy ndarray (having a USM data pointer).
if kwargs.get("out", None) is None:
@@ -246,7 +257,7 @@ def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
out_as_np = np.ndarray(out.shape, out.dtype, out)
kwargs["out"] = out_as_np
else:
-# If they manually gave dparray as out kwarg then we have to also
+# If they manually gave numpy_usm_shared as out kwarg then we have to also
# cast as regular NumPy ndarray to avoid recursion.
if isinstance(kwargs["out"], ndarray):
out = kwargs["out"]
@@ -271,7 +282,7 @@ def isdef(x):
cname = c[0]
if isdef(cname):
continue
-# For now we do the simple thing and copy the types from NumPy module into dparray module.
+# For now we do the simple thing and copy the types from NumPy module into numpy_usm_shared module.
new_func = "%s = np.%s" % (cname, cname)
try:
the_code = compile(new_func, "__init__", "exec")
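The newly added `external_usm_checkers` hook lets an external package teach `numpy_usm_shared.ndarray` to recognize its own USM-backed objects during `__array_finalize__`. A minimal sketch of how a consumer might register a checker (the checker function and its marker attribute are hypothetical):

```
from dpctl.dptensor import numpy_usm_shared as nus

def _is_usm_backed(obj):
    # Hypothetical test: a real checker would recognize its own
    # library's USM-allocated array types.
    return getattr(obj, "_uses_usm_memory", False)

# Registered checkers run in ndarray.__array_finalize__ before the
# fallback path that treats the source object as ordinary (non-USM) memory.
nus.ndarray.add_external_usm_checker(_is_usm_backed)
```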
57 changes: 57 additions & 0 deletions examples/cython/sycl_direct_linkage/README.md
@@ -0,0 +1,57 @@
# Example "sycl_direct_linkage"

This Cython extension does not use dpCtl; it links to SYCL directly.

It exposes the `columnwise_total` function, which uses oneMKL to compute
the total of each column of its argument matrix in double precision; the
matrix is expected as an ordinary NumPy array in C-contiguous layout.

This function performs the following steps:

1. Creates a SYCL queue using the default device selector
2. Creates a SYCL buffer around the matrix data
3. Creates a vector `v_ones` with all elements being ones,
   and allocates memory for the result
4. Calls oneMKL to compute xGEMV, as dot(v_ones, M) (see the sketch below)
5. Returns the result as a NumPy array
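
The xGEMV formulation in step 4 can be sanity-checked in pure NumPy; a minimal sketch using the same matrix shape as `bench.py`:

```
import numpy as np

M = np.full((10 ** 4, 4098), 1e-4, dtype="d")  # C-contiguous, double precision
v_ones = np.ones(M.shape[0], dtype="d")

# The column totals of M are exactly the matrix-vector product
# dot(v_ones, M), which is what the extension asks oneMKL's xGEMV to compute.
assert np.allclose(v_ones @ M, M.sum(axis=0))
```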

This extension does not allow one to control the device/queue to
which execution of the kernel is scheduled.

A related example, "sycl_buffer", modifies this example to use
`dpCtl` to retrieve the current queue, allowing the user to control
the queue and to avoid the overhead of queue creation, as sketched below.
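
A minimal sketch of that usage, assuming the `dpctl.device_context` API and the `syclbuffer` module built by the "sycl_buffer" example:

```
import dpctl
import numpy as np
import syclbuffer as sb  # extension built by the "sycl_buffer" example

X = np.full((10 ** 4, 4098), 1e-4, dtype="d")

# Run on an explicitly selected device; the extension reuses the
# current queue instead of constructing a new one on every call.
with dpctl.device_context("opencl:gpu:0"):
    print(sb.columnwise_total(X))
```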

To illustrate the queue-creation overhead incurred by each call, compare execution with the default queue,
which here is an Intel Gen9 GPU on the OpenCL backend:

```
(idp) [11:24:38 ansatnuc04 sycl_direct_linkage]$ SYCL_BE=PI_OPENCL python bench.py
========== Executing warm-up ==========
NumPy result: [1. 1. 1. ... 1. 1. 1.]
SYCL(default_device) result: [1. 1. 1. ... 1. 1. 1.]
Running time of 100 calls to columnwise_total on matrix with shape (10000, 4098)
Times for default_selector, inclusive of queue creation:
[19.384219504892826, 19.49932464491576, 19.613155928440392, 19.64031868893653, 19.752969074994326]
Times for NumPy
[3.5394036192446947, 3.498957809060812, 3.4925728561356664, 3.5036555202677846, 3.493739523924887]
```

vs. the timing when `dpctl`'s current queue is reused:

```
(idp) [11:29:14 ansatnuc04 sycl_buffer]$ python bench.py
========== Executing warm-up ==========
NumPy result: [1. 1. 1. ... 1. 1. 1.]
SYCL(Intel(R) Core(TM) i7-10710U CPU @ 1.10GHz) result: [1. 1. 1. ... 1. 1. 1.]
SYCL(Intel(R) Graphics Gen9 [0x9bca]) result: [1. 1. 1. ... 1. 1. 1.]
Times for 'opencl:cpu:0'
[2.9164800881408155, 2.8714500251226127, 2.9770236839540303, 2.913622073829174, 2.7949972581118345]
Times for 'opencl:gpu:0'
[9.529508924111724, 10.288004886358976, 10.189113245811313, 10.197128206957132, 10.26169267296791]
Times for NumPy
[3.4809365631081164, 3.42917942116037, 3.42471009073779, 3.3689011191017926, 3.4336009239777923]
```

So the per-call overhead of `sycl::queue` creation is roughly comparable with the time to
execute the actual computation.
36 changes: 36 additions & 0 deletions examples/cython/sycl_direct_linkage/bench.py
@@ -0,0 +1,36 @@
import dpctl
import syclbuffer_naive as sb
import numpy as np

X = np.full((10 ** 4, 4098), 1e-4, dtype="d")

# warm-up
print("=" * 10 + " Executing warm-up " + "=" * 10)
print("NumPy result: ", X.sum(axis=0))

print(
"SYCL(default_device) result: {}".format(
sb.columnwise_total(X),
)
)

import timeit

print(
"Running time of 100 calls to columnwise_total on matrix with shape {}".format(
X.shape
)
)

print("Times for default_selector, inclusive of queue creation:")
print(
timeit.repeat(
stmt="sb.columnwise_total(X)",
setup="sb.columnwise_total(X)", # ensure JIT compilation is not counted
number=100,
globals=globals(),
)
)

print("Times for NumPy")
print(timeit.repeat(stmt="X.sum(axis=0)", number=100, globals=globals()))
2 changes: 1 addition & 1 deletion examples/cython/sycl_direct_linkage/run.py
@@ -1,4 +1,4 @@
-import syclbuffer as sb
+import syclbuffer_naive as sb
import numpy as np

X = np.random.randn(20, 10)
2 changes: 1 addition & 1 deletion examples/cython/sycl_direct_linkage/sycl_function.cpp
@@ -1,6 +1,6 @@
#include <CL/sycl.hpp>
#include "sycl_function.hpp"
#include "mkl_blas_sycl.hpp"
#include <oneapi/mkl.hpp>
#include "mkl.h"

int c_columnwise_total(cl::sycl::queue &q, size_t n, size_t m, double *mat, double *ct) {
2 changes: 1 addition & 1 deletion examples/cython/usm_memory/blackscholes.pyx
@@ -2,7 +2,7 @@
# distutils: language=c++

cimport dpctl as c_dpctl
-cimport dpctl._memory as c_dpctl_mem
+cimport dpctl.memory as c_dpctl_mem
cimport numpy as cnp
from cython cimport floating

3 changes: 2 additions & 1 deletion examples/cython/usm_memory/sycl_blackscholes.cpp
@@ -1,7 +1,8 @@
#include <CL/sycl.hpp>
#include <oneapi/mkl.hpp>
#include <oneapi/mkl/rng/device.hpp>
#include "dpctl_sycl_types.h"
#include "sycl_blackscholes.hpp"
#include "mkl_rng_sycl_device.hpp"

template<typename T>
class black_scholes_kernel;
