diff --git a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp index 491fb12126..ed06d9a774 100644 --- a/dpctl/tensor/libtensor/include/kernels/accumulators.hpp +++ b/dpctl/tensor/libtensor/include/kernels/accumulators.hpp @@ -26,13 +26,13 @@ #include #include #include -#include #include #include #include +#include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" namespace dpctl { @@ -43,8 +43,6 @@ namespace kernels namespace accumulators { -namespace py = pybind11; - using namespace dpctl::tensor::offset_utils; template T ceiling_quotient(T n, T m) @@ -437,7 +435,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)( size_t, const char *, int, - const py::ssize_t *, + const ssize_t *, char *, std::vector &, const std::vector &); @@ -447,7 +445,7 @@ size_t accumulate_strided_impl(sycl::queue &q, size_t n_elems, const char *mask, int nd, - const py::ssize_t *shape_strides, + const ssize_t *shape_strides, char *cumsum, std::vector &host_tasks, const std::vector &depends = {}) diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp index 522baadc6d..46468de2e0 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp @@ -25,13 +25,13 @@ #pragma once #include #include -#include #include #include #include +#include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" namespace dpctl { @@ -42,8 +42,6 @@ namespace kernels namespace indexing { -namespace py = pybind11; - using namespace dpctl::tensor::offset_utils; template (orthog_i)); + orthog_src_dst_indexer(static_cast(orthog_i)); size_t total_src_offset = masked_src_indexer(masked_i) + 
orthog_offsets.get_first_offset(); @@ -161,7 +159,7 @@ struct MaskedPlaceStridedFunctor // + 1 : 1) if (mask_set) { auto orthog_offsets = - orthog_dst_rhs_indexer(static_cast(orthog_i)); + orthog_dst_rhs_indexer(static_cast(orthog_i)); size_t total_dst_offset = masked_dst_indexer(masked_i) + orthog_offsets.get_first_offset(); @@ -199,28 +197,28 @@ class masked_extract_all_slices_strided_impl_krn; typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)( sycl::queue &, - py::ssize_t, + ssize_t, const char *, const char *, char *, int, - py::ssize_t const *, - py::ssize_t, - py::ssize_t, + ssize_t const *, + ssize_t, + ssize_t, const std::vector &); template sycl::event masked_extract_all_slices_strided_impl( sycl::queue &exec_q, - py::ssize_t iteration_size, + ssize_t iteration_size, const char *src_p, const char *cumsum_p, char *dst_p, int nd, - const py::ssize_t + const ssize_t *packed_src_shape_strides, // [src_shape, src_strides], length 2*nd - py::ssize_t dst_size, // dst is 1D - py::ssize_t dst_stride, + ssize_t dst_size, // dst is 1D + ssize_t dst_stride, const std::vector &depends = {}) { // using MaskedExtractStridedFunctor; @@ -230,7 +228,7 @@ sycl::event masked_extract_all_slices_strided_impl( TwoZeroOffsets_Indexer orthog_src_dst_indexer{}; - /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides); Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride); @@ -254,19 +252,19 @@ sycl::event masked_extract_all_slices_strided_impl( typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)( sycl::queue &, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, const char *, const char *, char *, int, - py::ssize_t const *, - py::ssize_t, - py::ssize_t, + ssize_t const *, + ssize_t, + ssize_t, int, - py::ssize_t const *, - py::ssize_t, - py::ssize_t, + ssize_t const *, 
+ ssize_t, + ssize_t, const std::vector &); template sycl::event masked_extract_some_slices_strided_impl( sycl::queue &exec_q, - py::ssize_t orthog_nelems, - py::ssize_t masked_nelems, + ssize_t orthog_nelems, + ssize_t masked_nelems, const char *src_p, const char *cumsum_p, char *dst_p, int orthog_nd, - const py::ssize_t + const ssize_t *packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides, // ortho_dst_strides], length // 3*ortho_nd - py::ssize_t ortho_src_offset, - py::ssize_t ortho_dst_offset, + ssize_t ortho_src_offset, + ssize_t ortho_dst_offset, int masked_nd, - const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape, - // masked_src_strides], - // length 2*masked_nd - py::ssize_t masked_dst_size, // mask_dst is 1D - py::ssize_t masked_dst_stride, + const ssize_t *packed_masked_src_shape_strides, // [masked_src_shape, + // masked_src_strides], + // length 2*masked_nd + ssize_t masked_dst_size, // mask_dst is 1D + ssize_t masked_dst_stride, const std::vector &depends = {}) { // using MaskedExtractStridedFunctor; @@ -381,33 +379,33 @@ class masked_place_all_slices_strided_impl_krn; typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)( sycl::queue &, - py::ssize_t, + ssize_t, char *, const char *, const char *, int, - py::ssize_t const *, - py::ssize_t, - py::ssize_t, + ssize_t const *, + ssize_t, + ssize_t, const std::vector &); template sycl::event masked_place_all_slices_strided_impl( sycl::queue &exec_q, - py::ssize_t iteration_size, + ssize_t iteration_size, char *dst_p, const char *cumsum_p, const char *rhs_p, int nd, - const py::ssize_t + const ssize_t *packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd - py::ssize_t rhs_size, // rhs is 1D - py::ssize_t rhs_stride, + ssize_t rhs_size, // rhs is 1D + ssize_t rhs_stride, const std::vector &depends = {}) { TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{}; - /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + /* 
StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides); Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride); @@ -431,19 +429,19 @@ sycl::event masked_place_all_slices_strided_impl( typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)( sycl::queue &, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, char *, const char *, const char *, int, - py::ssize_t const *, - py::ssize_t, - py::ssize_t, + ssize_t const *, + ssize_t, + ssize_t, int, - py::ssize_t const *, - py::ssize_t, - py::ssize_t, + ssize_t const *, + ssize_t, + ssize_t, const std::vector &); template sycl::event masked_place_some_slices_strided_impl( sycl::queue &exec_q, - py::ssize_t orthog_nelems, - py::ssize_t masked_nelems, + ssize_t orthog_nelems, + ssize_t masked_nelems, char *dst_p, const char *cumsum_p, const char *rhs_p, int orthog_nd, - const py::ssize_t + const ssize_t *packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides, // ortho_rhs_strides], length // 3*ortho_nd - py::ssize_t ortho_dst_offset, - py::ssize_t ortho_rhs_offset, + ssize_t ortho_dst_offset, + ssize_t ortho_rhs_offset, int masked_nd, - const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape, - // masked_dst_strides], - // length 2*masked_nd - py::ssize_t masked_rhs_size, // mask_dst is 1D - py::ssize_t masked_rhs_stride, + const ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape, + // masked_dst_strides], + // length 2*masked_nd + ssize_t masked_rhs_size, // mask_dst is 1D + ssize_t masked_rhs_stride, const std::vector &depends = {}) { TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{ orthog_nd, ortho_dst_offset, ortho_rhs_offset, packed_ortho_dst_rhs_shape_strides}; - /* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const + /* StridedIndexer(int _nd, ssize_t _offset, ssize_t const * *_packed_shape_strides) */ StridedIndexer 
masked_dst_indexer{masked_nd, 0, packed_masked_dst_shape_strides}; @@ -550,22 +548,22 @@ template class non_zero_indexes_krn; typedef sycl::event (*non_zero_indexes_fn_ptr_t)( sycl::queue &, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, int, const char *, char *, - const py::ssize_t *, + const ssize_t *, std::vector const &); template sycl::event non_zero_indexes_impl(sycl::queue &exec_q, - py::ssize_t iter_size, - py::ssize_t nz_elems, + ssize_t iter_size, + ssize_t nz_elems, int nd, const char *cumsum_cp, char *indexes_cp, - const py::ssize_t *mask_shape, + const ssize_t *mask_shape, std::vector const &depends) { const indT1 *cumsum_data = reinterpret_cast(cumsum_cp); @@ -582,11 +580,11 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q, auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0); bool cond = (cs_curr_val == cs_prev_val); - py::ssize_t i_ = static_cast(i); + ssize_t i_ = static_cast(i); for (int dim = nd; --dim > 0;) { auto sd = mask_shape[dim]; - py::ssize_t q = i_ / sd; - py::ssize_t r = (i_ - q * sd); + ssize_t q = i_ / sd; + ssize_t r = (i_ - q * sd); if (cond) { indexes_data[cs_curr_val + dim * nz_elems] = static_cast(r); diff --git a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp index 877680c8bf..ee64bd2e44 100644 --- a/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp +++ b/dpctl/tensor/libtensor/include/kernels/boolean_reductions.hpp @@ -31,15 +31,12 @@ #include #include -#include "pybind11/pybind11.h" - +#include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -namespace py = pybind11; - namespace dpctl { namespace tensor @@ -179,16 +176,16 @@ struct SequentialBooleanReduction { auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); - const py::ssize_t &inp_iter_offset = + 
const ssize_t &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const py::ssize_t &out_iter_offset = + const ssize_t &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); outT red_val(identity_); for (size_t m = 0; m < reduction_max_gid_; ++m) { - py::ssize_t inp_reduction_offset = - static_cast(inp_reduced_dims_indexer_(m)); - py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + ssize_t inp_reduction_offset = + static_cast(inp_reduced_dims_indexer_(m)); + ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; // must convert to boolean first to handle nans using dpctl::tensor::type_utils::convert_impl; @@ -249,9 +246,9 @@ typedef sycl::event (*boolean_reduction_contig_impl_fn_ptr)( size_t, const char *, char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template @@ -269,9 +266,9 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q, size_t reduction_nelems, const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t red_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t red_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -298,8 +295,8 @@ boolean_reduction_axis1_contig_impl(sycl::queue &exec_q, using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -425,9 +422,9 @@ struct StridedBooleanReduction const size_t wg_size = it.get_local_range(0); auto inp_out_iter_offsets_ = inp_out_iter_indexer_(reduction_id); - const py::ssize_t &inp_iter_offset = + const ssize_t &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const py::ssize_t &out_iter_offset = + const ssize_t &out_iter_offset = 
inp_out_iter_offsets_.get_second_offset(); outT local_red_val(identity_); @@ -438,9 +435,9 @@ struct StridedBooleanReduction for (size_t arg_reduce_gid = arg_reduce_gid0; arg_reduce_gid < arg_reduce_gid_max; arg_reduce_gid += wg_size) { - py::ssize_t inp_reduction_offset = static_cast( - inp_reduced_dims_indexer_(arg_reduce_gid)); - py::ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; + ssize_t inp_reduction_offset = + static_cast(inp_reduced_dims_indexer_(arg_reduce_gid)); + ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; // must convert to boolean first to handle nans using dpctl::tensor::type_utils::convert_impl; @@ -470,9 +467,9 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q, size_t reduction_nelems, const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t red_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t red_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -507,8 +504,8 @@ boolean_reduction_axis0_contig_impl(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; constexpr size_t preferred_reductions_per_wi = 4; size_t reductions_per_wi = @@ -582,12 +579,12 @@ typedef sycl::event (*boolean_reduction_strided_impl_fn_ptr)( const char *, char *, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, int, - const py::ssize_t *, - py::ssize_t, + const ssize_t *, + ssize_t, const std::vector &); template @@ -598,12 +595,12 @@ boolean_reduction_strided_impl(sycl::queue &exec_q, const char *arg_cp, char *res_cp, int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, + const ssize_t 
*iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp); @@ -647,8 +644,8 @@ boolean_reduction_strided_impl(sycl::queue &exec_q, using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; - const py::ssize_t *const &res_shape = iter_shape_and_strides; - const py::ssize_t *const &res_strides = + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = iter_shape_and_strides + 2 * iter_nd; IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); diff --git a/dpctl/tensor/libtensor/include/kernels/clip.hpp b/dpctl/tensor/libtensor/include/kernels/clip.hpp index aff1acb071..6d9bae6ed5 100644 --- a/dpctl/tensor/libtensor/include/kernels/clip.hpp +++ b/dpctl/tensor/libtensor/include/kernels/clip.hpp @@ -23,19 +23,16 @@ //===----------------------------------------------------------------------===// #pragma once -#include "pybind11/numpy.h" -#include "pybind11/stl.h" -#include #include #include #include -#include +#include #include +#include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" #include "utils/type_utils.hpp" namespace dpctl @@ -47,9 +44,6 @@ namespace kernels namespace clip { -namespace py = pybind11; -namespace td_ns = dpctl::tensor::type_dispatch; - using namespace dpctl::tensor::offset_utils; using dpctl::tensor::kernels::alignment_utils:: @@ -257,7 +251,7 @@ template class ClipStridedFunctor void operator()(sycl::id<1> id) const { size_t gid = id[0]; - auto offsets = indexer(static_cast(gid)); + auto offsets = indexer(static_cast(gid)); dst_p[offsets.get_fourth_offset()] = clip( x_p[offsets.get_first_offset()], 
min_p[offsets.get_second_offset()], max_p[offsets.get_third_offset()]); @@ -274,11 +268,11 @@ typedef sycl::event (*clip_strided_impl_fn_ptr_t)( const char *, const char *, char *, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template @@ -289,11 +283,11 @@ sycl::event clip_strided_impl(sycl::queue &q, const char *min_cp, const char *max_cp, char *dst_cp, - const py::ssize_t *shape_strides, - py::ssize_t x_offset, - py::ssize_t min_offset, - py::ssize_t max_offset, - py::ssize_t dst_offset, + const ssize_t *shape_strides, + ssize_t x_offset, + ssize_t min_offset, + ssize_t max_offset, + ssize_t dst_offset, const std::vector &depends) { const T *x_tp = reinterpret_cast(x_cp); diff --git a/dpctl/tensor/libtensor/include/kernels/constructors.hpp b/dpctl/tensor/libtensor/include/kernels/constructors.hpp index c28033d23d..4cab7c213c 100644 --- a/dpctl/tensor/libtensor/include/kernels/constructors.hpp +++ b/dpctl/tensor/libtensor/include/kernels/constructors.hpp @@ -24,11 +24,11 @@ //===----------------------------------------------------------------------===// #pragma once +#include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" #include "utils/strided_iters.hpp" #include "utils/type_utils.hpp" #include -#include #include namespace dpctl @@ -48,37 +48,8 @@ template class linear_sequence_step_kernel; template class linear_sequence_affine_kernel; template class eye_kernel; -namespace py = pybind11; using namespace dpctl::tensor::offset_utils; -/* =========== Unboxing Python scalar =============== */ - -/*! - * @brief Cast pybind11 class managing Python object to specified type `T`. 
- * @defgroup CtorKernels - */ -template T unbox_py_scalar(const py::object &o) -{ - return py::cast(o); -} - -template <> inline sycl::half unbox_py_scalar(const py::object &o) -{ - float tmp = py::cast(o); - return static_cast(tmp); -} - -// Constructor to populate tensor with linear sequence defined by -// start and step data - -typedef sycl::event (*lin_space_step_fn_ptr_t)( - sycl::queue &, - size_t, // num_elements - const py::object &start, - const py::object &step, - char *, // dst_data_ptr - const std::vector &); - template class LinearSequenceStepFunctor { private: @@ -142,74 +113,9 @@ sycl::event lin_space_step_impl(sycl::queue &exec_q, return lin_space_step_event; } -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting value and increment - * given as Python objects. - * - * @param q Sycl queue to which the kernel is submitted - * @param nelems Length of the sequence - * @param start Starting value of the sequence as Python object. Must be - * convertible to array element data type `Ty`. - * @param step Increment of the sequence as Python object. Must be convertible - * to array element data type `Ty`. - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event lin_space_step_impl(sycl::queue &exec_q, - size_t nelems, - const py::object &start, - const py::object &step, - char *array_data, - const std::vector &depends) -{ - Ty start_v; - Ty step_v; - try { - start_v = unbox_py_scalar(start); - step_v = unbox_py_scalar(step); - } catch (const py::error_already_set &e) { - throw; - } - - auto lin_space_step_event = lin_space_step_impl( - exec_q, nelems, start_v, step_v, array_data, depends); - - return lin_space_step_event; -} - -/*! - * @brief Factor to get function pointer of type `fnT` for array with elements - * of type `Ty`. - * @defgroup CtorKernels - */ -template struct LinSpaceStepFactory -{ - fnT get() - { - fnT f = lin_space_step_impl; - return f; - } -}; - // Constructor to populate tensor with linear sequence defined by // start and and data -typedef sycl::event (*lin_space_affine_fn_ptr_t)( - sycl::queue &, - size_t, // num_elements - const py::object &start, - const py::object &end, - bool include_endpoint, - char *, // dst_data_ptr - const std::vector &); - template class LinearSequenceAffineFunctor { private: @@ -312,70 +218,8 @@ sycl::event lin_space_affine_impl(sycl::queue &exec_q, return lin_space_affine_event; } -/*! - * @brief Function to submit kernel to populate given contiguous memory - * allocation with linear sequence specified by starting and end values given - * as Python objects. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param start Stating value of the sequence as Python object. Must be - * convertible to array data element type `Ty`. - * @param end End-value of the sequence as Python object. Must be convertible - * to array data element type `Ty`. - * @param include_endpoint Whether the end-value is included in the sequence - * @param array_data Kernel accessible USM pointer to the start of array to be - * populated. 
- * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. - * @defgroup CtorKernels - */ -template -sycl::event lin_space_affine_impl(sycl::queue &exec_q, - size_t nelems, - const py::object &start, - const py::object &end, - bool include_endpoint, - char *array_data, - const std::vector &depends) -{ - Ty start_v, end_v; - try { - start_v = unbox_py_scalar(start); - end_v = unbox_py_scalar(end); - } catch (const py::error_already_set &e) { - throw; - } - - auto lin_space_affine_event = lin_space_affine_impl( - exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); - - return lin_space_affine_event; -} - -/*! - * @brief Factory to get function pointer of type `fnT` for array data type - * `Ty`. - */ -template struct LinSpaceAffineFactory -{ - fnT get() - { - fnT f = lin_space_affine_impl; - return f; - } -}; - /* ================ Full ================== */ -typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, - size_t, - const py::object &, - char *, - const std::vector &); - /*! * @brief Function to submit kernel to fill given contiguous memory allocation * with specified value. @@ -408,58 +252,13 @@ sycl::event full_contig_impl(sycl::queue &q, return fill_ev; } -/*! - * @brief Function to submit kernel to fill given contiguous memory allocation - * with specified value. - * - * @param exec_q Sycl queue to which kernel is submitted for execution. - * @param nelems Length of the sequence - * @param py_value Python object representing the value to fill the array with. - * Must be convertible to `dstTy`. - * @param dst_p Kernel accessible USM pointer to the start of array to be - * populated. - * @param depends List of events to wait for before starting computations, if - * any. - * - * @return Event to wait on to ensure that computation completes. 
- * @defgroup CtorKernels - */ -template -sycl::event full_contig_impl(sycl::queue &exec_q, - size_t nelems, - const py::object &py_value, - char *dst_p, - const std::vector &depends) -{ - dstTy fill_v; - try { - fill_v = unbox_py_scalar(py_value); - } catch (const py::error_already_set &e) { - throw; - } - - sycl::event fill_ev = - full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); - - return fill_ev; -} - -template struct FullContigFactory -{ - fnT get() - { - fnT f = full_contig_impl; - return f; - } -}; - /* ================ Eye ================== */ typedef sycl::event (*eye_fn_ptr_t)(sycl::queue &, size_t nelems, // num_elements - py::ssize_t start, - py::ssize_t end, - py::ssize_t step, + ssize_t start, + ssize_t end, + ssize_t step, char *, // dst_data_ptr const std::vector &); @@ -467,15 +266,15 @@ template class EyeFunctor { private: Ty *p = nullptr; - py::ssize_t start_v; - py::ssize_t end_v; - py::ssize_t step_v; + ssize_t start_v; + ssize_t end_v; + ssize_t step_v; public: EyeFunctor(char *dst_p, - const py::ssize_t v0, - const py::ssize_t v1, - const py::ssize_t dv) + const ssize_t v0, + const ssize_t v1, + const ssize_t dv) : p(reinterpret_cast(dst_p)), start_v(v0), end_v(v1), step_v(dv) { } @@ -483,7 +282,7 @@ template class EyeFunctor void operator()(sycl::id<1> wiid) const { Ty set_v = 0; - py::ssize_t i = static_cast(wiid.get(0)); + ssize_t i = static_cast(wiid.get(0)); if (i >= start_v and i <= end_v) { if ((i - start_v) % step_v == 0) { set_v = 1; @@ -511,9 +310,9 @@ template class EyeFunctor template sycl::event eye_impl(sycl::queue &exec_q, size_t nelems, - const py::ssize_t start, - const py::ssize_t end, - const py::ssize_t step, + const ssize_t start, + const ssize_t end, + const ssize_t step, char *array_data, const std::vector &depends) { @@ -545,13 +344,13 @@ template struct EyeFactory // define function type typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, - py::ssize_t, // inner_range //py::ssize_t - py::ssize_t, // 
outer_range - char *, // src_data_ptr - char *, // dst_data_ptr - py::ssize_t, // nd - py::ssize_t *, // shape_and_strides - py::ssize_t, // k + ssize_t, // inner_range //ssize_t + ssize_t, // outer_range + char *, // src_data_ptr + char *, // dst_data_ptr + ssize_t, // nd + ssize_t *, // shape_and_strides + ssize_t, // k const std::vector &, const std::vector &); @@ -580,21 +379,21 @@ typedef sycl::event (*tri_fn_ptr_t)(sycl::queue &, template class tri_kernel; template sycl::event tri_impl(sycl::queue &exec_q, - py::ssize_t inner_range, - py::ssize_t outer_range, + ssize_t inner_range, + ssize_t outer_range, char *src_p, char *dst_p, - py::ssize_t nd, - py::ssize_t *shape_and_strides, - py::ssize_t k, + ssize_t nd, + ssize_t *shape_and_strides, + ssize_t k, const std::vector &depends, const std::vector &additional_depends) { constexpr int d2 = 2; - py::ssize_t src_s = nd; - py::ssize_t dst_s = 2 * nd; - py::ssize_t nd_1 = nd - 1; - py::ssize_t nd_2 = nd - 2; + ssize_t src_s = nd; + ssize_t dst_s = 2 * nd; + ssize_t nd_1 = nd - 1; + ssize_t nd_2 = nd - 2; Ty *src = reinterpret_cast(src_p); Ty *dst = reinterpret_cast(dst_p); @@ -606,18 +405,18 @@ sycl::event tri_impl(sycl::queue &exec_q, cgh.parallel_for>( sycl::range<1>(inner_range * outer_range), [=](sycl::id<1> idx) { - py::ssize_t outer_gid = idx[0] / inner_range; - py::ssize_t inner_gid = idx[0] - inner_range * outer_gid; + ssize_t outer_gid = idx[0] / inner_range; + ssize_t inner_gid = idx[0] - inner_range * outer_gid; - py::ssize_t src_inner_offset = 0, dst_inner_offset = 0; + ssize_t src_inner_offset = 0, dst_inner_offset = 0; bool to_copy(true); { using dpctl::tensor::strides::CIndexer_array; - CIndexer_array indexer_i( + CIndexer_array indexer_i( {shape_and_strides[nd_2], shape_and_strides[nd_1]}); indexer_i.set(inner_gid); - const std::array &inner = indexer_i.get(); + const std::array &inner = indexer_i.get(); src_inner_offset = inner[0] * shape_and_strides[src_s + nd_2] + inner[1] * 
shape_and_strides[src_s + nd_1]; @@ -631,11 +430,11 @@ sycl::event tri_impl(sycl::queue &exec_q, to_copy = (inner[0] + k <= inner[1]); } - py::ssize_t src_offset = 0; - py::ssize_t dst_offset = 0; + ssize_t src_offset = 0; + ssize_t dst_offset = 0; { using dpctl::tensor::strides::CIndexer_vector; - CIndexer_vector outer(nd - d2); + CIndexer_vector outer(nd - d2); outer.get_displacement( outer_gid, shape_and_strides, shape_and_strides + src_s, shape_and_strides + dst_s, src_offset, dst_offset); diff --git a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp index ef24b58ef2..9bf86e560b 100644 --- a/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp +++ b/dpctl/tensor/libtensor/include/kernels/copy_and_cast.hpp @@ -25,10 +25,10 @@ #pragma once #include #include -#include #include #include +#include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" @@ -42,7 +42,6 @@ namespace kernels namespace copy_and_cast { -namespace py = pybind11; using namespace dpctl::tensor::offset_utils; using dpctl::tensor::kernels::alignment_utils:: @@ -89,9 +88,9 @@ class GenericCopyFunctor void operator()(sycl::id<1> wiid) const { - const auto &offsets = indexer_(static_cast(wiid.get(0))); - const py::ssize_t &src_offset = offsets.get_first_offset(); - const py::ssize_t &dst_offset = offsets.get_second_offset(); + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); CastFnT fn{}; dst_[dst_offset] = fn(src_[src_offset]); @@ -109,11 +108,11 @@ typedef sycl::event (*copy_and_cast_generic_fn_ptr_t)( sycl::queue &, size_t, int, - const py::ssize_t *, + const ssize_t *, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &, const std::vector &); @@ -155,11 +154,11 @@ sycl::event 
copy_and_cast_generic_impl(sycl::queue &q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *src_p, - py::ssize_t src_offset, + ssize_t src_offset, char *dst_p, - py::ssize_t dst_offset, + ssize_t dst_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -389,13 +388,13 @@ template struct CopyAndCastContigFactory typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( sycl::queue &, size_t, - const std::array, - const std::array, - const std::array, + const std::array, + const std::array, + const std::array, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); /*! @@ -405,13 +404,13 @@ typedef sycl::event (*copy_and_cast_1d_fn_ptr_t)( typedef sycl::event (*copy_and_cast_2d_fn_ptr_t)( sycl::queue &, size_t, - const std::array, - const std::array, - const std::array, + const std::array, + const std::array, + const std::array, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); /*! 
@@ -447,13 +446,13 @@ template sycl::event copy_and_cast_nd_specialized_impl(sycl::queue &q, size_t nelems, - const std::array shape, - const std::array src_strides, - const std::array dst_strides, + const std::array shape, + const std::array src_strides, + const std::array dst_strides, const char *src_p, - py::ssize_t src_offset, + ssize_t src_offset, char *dst_p, - py::ssize_t dst_offset, + ssize_t dst_offset, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); @@ -528,9 +527,9 @@ class GenericCopyFromHostFunctor void operator()(sycl::id<1> wiid) const { - const auto &offsets = indexer_(static_cast(wiid.get(0))); - const py::ssize_t &src_offset = offsets.get_first_offset(); - const py::ssize_t &dst_offset = offsets.get_second_offset(); + const auto &offsets = indexer_(static_cast(wiid.get(0))); + const ssize_t &src_offset = offsets.get_first_offset(); + const ssize_t &dst_offset = offsets.get_second_offset(); CastFnT fn{}; dst_[dst_offset] = fn(src_acc_[src_offset]); @@ -541,13 +540,13 @@ typedef void (*copy_and_cast_from_host_blocking_fn_ptr_t)( sycl::queue &, size_t, int, - py::ssize_t *, + ssize_t *, const char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &, const std::vector &); @@ -594,17 +593,17 @@ void copy_and_cast_from_host_impl( sycl::queue &q, size_t nelems, int nd, - py::ssize_t *shape_and_strides, + ssize_t *shape_and_strides, const char *host_src_p, - py::ssize_t src_offset, - py::ssize_t src_min_nelem_offset, - py::ssize_t src_max_nelem_offset, + ssize_t src_offset, + ssize_t src_min_nelem_offset, + ssize_t src_max_nelem_offset, char *dst_p, - py::ssize_t dst_offset, + ssize_t dst_offset, const std::vector &depends, const std::vector &additional_depends) { - py::ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; + ssize_t nelems_range = src_max_nelem_offset - src_min_nelem_offset + 1; 
dpctl::tensor::type_utils::validate_type_for_device(q); dpctl::tensor::type_utils::validate_type_for_device(q); @@ -621,7 +620,7 @@ void copy_and_cast_from_host_impl( TwoOffsets_StridedIndexer indexer{ nd, src_offset - src_min_nelem_offset, dst_offset, - const_cast(shape_and_strides)}; + const_cast(shape_and_strides)}; dstTy *dst_tp = reinterpret_cast(dst_p); @@ -683,8 +682,8 @@ class GenericCopyForReshapeFunctor void operator()(sycl::id<1> wiid) const { - const py::ssize_t src_offset = src_indexer_(wiid.get(0)); - const py::ssize_t dst_offset = dst_indexer_(wiid.get(0)); + const ssize_t src_offset = src_indexer_(wiid.get(0)); + const ssize_t dst_offset = dst_indexer_(wiid.get(0)); dst_p[dst_offset] = src_p[src_offset]; } @@ -693,12 +692,12 @@ class GenericCopyForReshapeFunctor // define function type typedef sycl::event (*copy_for_reshape_fn_ptr_t)( sycl::queue &, - size_t, // num_elements - int, // src_nd - int, // dst_nd - py::ssize_t *, // packed shapes and strides - const char *, // src_data_ptr - char *, // dst_data_ptr + size_t, // num_elements + int, // src_nd + int, // dst_nd + ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + char *, // dst_data_ptr const std::vector &); /*! 
@@ -728,7 +727,7 @@ copy_for_reshape_generic_impl(sycl::queue &q, size_t nelems, int src_nd, int dst_nd, - py::ssize_t *packed_shapes_and_strides, + ssize_t *packed_shapes_and_strides, const char *src_p, char *dst_p, const std::vector &depends) @@ -742,12 +741,11 @@ copy_for_reshape_generic_impl(sycl::queue &q, // USM array of size 2*(src_nd + dst_nd) // [ src_shape; src_strides; dst_shape; dst_strides ] - const py::ssize_t *src_shape_and_strides = - const_cast(packed_shapes_and_strides); + const ssize_t *src_shape_and_strides = + const_cast(packed_shapes_and_strides); - const py::ssize_t *dst_shape_and_strides = - const_cast(packed_shapes_and_strides + - (2 * src_nd)); + const ssize_t *dst_shape_and_strides = const_cast( + packed_shapes_and_strides + (2 * src_nd)); StridedIndexer src_indexer{src_nd, 0, src_shape_and_strides}; StridedIndexer dst_indexer{dst_nd, 0, dst_shape_and_strides}; @@ -820,35 +818,34 @@ template struct CompositionIndexer struct RolledNDIndexer { RolledNDIndexer(int nd, - const py::ssize_t *shape, - const py::ssize_t *strides, - const py::ssize_t *ndshifts, - py::ssize_t starting_offset) + const ssize_t *shape, + const ssize_t *strides, + const ssize_t *ndshifts, + ssize_t starting_offset) : nd_(nd), shape_(shape), strides_(strides), ndshifts_(ndshifts), starting_offset_(starting_offset) { } - py::ssize_t operator()(size_t gid) const + ssize_t operator()(size_t gid) const { return compute_offset(gid); } private: int nd_ = -1; - const py::ssize_t *shape_ = nullptr; - const py::ssize_t *strides_ = nullptr; - const py::ssize_t *ndshifts_ = nullptr; - py::ssize_t starting_offset_ = 0; + const ssize_t *shape_ = nullptr; + const ssize_t *strides_ = nullptr; + const ssize_t *ndshifts_ = nullptr; + ssize_t starting_offset_ = 0; - py::ssize_t compute_offset(py::ssize_t gid) const + ssize_t compute_offset(ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; CIndexer_vector _ind(nd_); - py::ssize_t relative_offset_(0); - 
_ind.get_left_rolled_displacement( + ssize_t relative_offset_(0); + _ind.get_left_rolled_displacement( gid, shape_, // shape ptr strides_, // strides ptr @@ -884,8 +881,8 @@ class StridedCopyForRollFunctor { const size_t gid = wiid.get(0); - const py::ssize_t src_offset = src_indexer_(gid); - const py::ssize_t dst_offset = dst_indexer_(gid); + const ssize_t src_offset = src_indexer_(gid); + const ssize_t dst_offset = dst_indexer_(gid); dst_p[dst_offset] = src_p[src_offset]; } @@ -894,14 +891,14 @@ class StridedCopyForRollFunctor // define function type typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( sycl::queue &, - size_t, // shift - size_t, // num_elements - int, // common_nd - const py::ssize_t *, // packed shapes and strides - const char *, // src_data_ptr - py::ssize_t, // src_offset - char *, // dst_data_ptr - py::ssize_t, // dst_offset + size_t, // shift + size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shapes and strides + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset const std::vector &); /*! 
@@ -929,17 +926,16 @@ typedef sycl::event (*copy_for_roll_strided_fn_ptr_t)( * @ingroup CopyAndCastKernels */ template -sycl::event -copy_for_roll_strided_impl(sycl::queue &q, - size_t shift, - size_t nelems, - int nd, - const py::ssize_t *packed_shapes_and_strides, - const char *src_p, - py::ssize_t src_offset, - char *dst_p, - py::ssize_t dst_offset, - const std::vector &depends) +sycl::event copy_for_roll_strided_impl(sycl::queue &q, + size_t shift, + size_t nelems, + int nd, + const ssize_t *packed_shapes_and_strides, + const char *src_p, + ssize_t src_offset, + char *dst_p, + ssize_t dst_offset, + const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); @@ -985,9 +981,9 @@ typedef sycl::event (*copy_for_roll_contig_fn_ptr_t)( size_t, // shift size_t, // num_elements const char *, // src_data_ptr - py::ssize_t, // src_offset + ssize_t, // src_offset char *, // dst_data_ptr - py::ssize_t, // dst_offset + ssize_t, // dst_offset const std::vector &); template class copy_for_roll_contig_kernel; @@ -1018,9 +1014,9 @@ sycl::event copy_for_roll_contig_impl(sycl::queue &q, size_t shift, size_t nelems, const char *src_p, - py::ssize_t src_offset, + ssize_t src_offset, char *dst_p, - py::ssize_t dst_offset, + ssize_t dst_offset, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); @@ -1085,13 +1081,13 @@ class copy_for_roll_ndshift_strided_kernel; // define function type typedef sycl::event (*copy_for_roll_ndshift_strided_fn_ptr_t)( sycl::queue &, - size_t, // num_elements - int, // common_nd - const py::ssize_t *, // packed shape, strides, shifts - const char *, // src_data_ptr - py::ssize_t, // src_offset - char *, // dst_data_ptr - py::ssize_t, // dst_offset + size_t, // num_elements + int, // common_nd + const ssize_t *, // packed shape, strides, shifts + const char *, // src_data_ptr + ssize_t, // src_offset + char *, // dst_data_ptr + ssize_t, // dst_offset const std::vector &); template @@ -1099,11 
+1095,11 @@ sycl::event copy_for_roll_ndshift_strided_impl( sycl::queue &q, size_t nelems, int nd, - const py::ssize_t *packed_shapes_and_strides_and_shifts, + const ssize_t *packed_shapes_and_strides_and_shifts, const char *src_p, - py::ssize_t src_offset, + ssize_t src_offset, char *dst_p, - py::ssize_t dst_offset, + ssize_t dst_offset, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); @@ -1115,12 +1111,12 @@ sycl::event copy_for_roll_ndshift_strided_impl( // USM array of size 4 * nd // [ common_shape; src_strides; dst_strides; shifts ] - const py::ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; - const py::ssize_t *src_strides_ptr = + const ssize_t *shape_ptr = packed_shapes_and_strides_and_shifts; + const ssize_t *src_strides_ptr = packed_shapes_and_strides_and_shifts + nd; - const py::ssize_t *dst_strides_ptr = + const ssize_t *dst_strides_ptr = packed_shapes_and_strides_and_shifts + 2 * nd; - const py::ssize_t *shifts_ptr = + const ssize_t *shifts_ptr = packed_shapes_and_strides_and_shifts + 3 * nd; RolledNDIndexer src_indexer{nd, shape_ptr, src_strides_ptr, shifts_ptr, diff --git a/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp new file mode 100644 index 0000000000..c88d838abf --- /dev/null +++ b/dpctl/tensor/libtensor/include/kernels/dpctl_tensor_types.hpp @@ -0,0 +1,37 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include + +namespace dpctl +{ +namespace tensor +{ + +typedef std::ptrdiff_t ssize_t; + +} +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp index 9e13648163..591f9cb24f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/abs.hpp @@ -34,10 +34,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace abs { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -214,11 +213,11 @@ template sycl::event abs_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector 
&additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp index cf6875c341..236999404e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acos.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace acos { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -219,11 +218,11 @@ sycl::event acos_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp index a6ffa805d7..76d28ae92b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/acosh.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace acosh { -namespace py = pybind11; namespace td_ns = 
dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -241,11 +240,11 @@ sycl::event acosh_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp index aae69d98ea..77bb3c4d67 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/add.hpp @@ -31,12 +31,12 @@ #include "sycl_complex.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace add { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -218,11 +217,11 @@ template sycl::event add_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -264,13 +263,13 @@ template sycl::event add_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t 
arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -314,12 +313,12 @@ sycl::event add_contig_matrix_contig_row_broadcast_impl( size_t n0, size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, // res[i,j] = mat[i,j] + vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< @@ -363,12 +362,12 @@ sycl::event add_contig_row_contig_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, // res[i,j] = mat[i,j] + vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return add_contig_matrix_contig_row_broadcast_impl( @@ -456,9 +455,9 @@ sycl::event add_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -490,11 +489,11 @@ sycl::event add_inplace_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -538,9 +537,9 @@ sycl::event add_inplace_row_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_row_matrix_broadcast_impl< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp index 2759974b93..75512d80b8 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/angle.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace angle 
{ -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -151,11 +150,11 @@ sycl::event angle_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp index dc5f2c2b18..0e27841d1e 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asin.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace asin { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -243,11 +242,11 @@ sycl::event asin_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp index 6d712165a9..b774de27da 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/asinh.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace asinh { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -217,11 +216,11 @@ sycl::event asinh_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp index 93c9a6696d..c71498c196 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace atan { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -219,11 +218,11 @@ sycl::event atan_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t 
arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp index ac8c0483c4..012eaa7ce4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atan2.hpp @@ -30,11 +30,11 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace atan2 { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -114,11 +113,11 @@ template sycl::event atan2_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -163,13 +162,13 @@ sycl::event atan2_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp index 4a26cd92b4..d227047c51 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/atanh.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace atanh { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -212,11 +211,11 @@ sycl::event atanh_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp index e4da56cd9e..2e3647ec9c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_and.hpp @@ -29,12 +29,12 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace bitwise_and { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -173,11 +172,11 @@ sycl::event bitwise_and_contig_impl(sycl::queue &exec_q, 
size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -224,13 +223,13 @@ sycl::event bitwise_and_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -328,9 +327,9 @@ sycl::event bitwise_and_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -366,11 +365,11 @@ sycl::event bitwise_and_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp index cc629594b9..434089a3f0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_invert.hpp @@ -31,10 +31,10 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include 
"utils/type_utils.hpp" -#include +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" namespace dpctl @@ -46,7 +46,6 @@ namespace kernels namespace bitwise_invert { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -178,11 +177,11 @@ sycl::event bitwise_invert_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp index 58ef64e16a..3748034098 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_left_shift.hpp @@ -30,12 +30,12 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace bitwise_left_shift { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -182,11 +181,11 @@ sycl::event bitwise_left_shift_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return 
elementwise_common::binary_contig_impl< @@ -235,13 +234,13 @@ sycl::event bitwise_left_shift_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -345,9 +344,9 @@ sycl::event bitwise_left_shift_inplace_contig_impl( sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -383,11 +382,11 @@ sycl::event bitwise_left_shift_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp index afd24216b2..b4738f7d5a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_or.hpp @@ -29,12 +29,12 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace bitwise_or { 
-namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -171,11 +170,11 @@ template sycl::event bitwise_or_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -221,13 +220,13 @@ sycl::event bitwise_or_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -324,9 +323,9 @@ sycl::event bitwise_or_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -362,11 +361,11 @@ sycl::event bitwise_or_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp index f1989b8f64..c336d949b6 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_right_shift.hpp @@ -30,12 +30,12 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace bitwise_right_shift { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -184,11 +183,11 @@ sycl::event bitwise_right_shift_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -237,13 +236,13 @@ sycl::event bitwise_right_shift_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -349,9 +348,9 @@ sycl::event bitwise_right_shift_inplace_contig_impl( sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -387,11 +386,11 @@ sycl::event bitwise_right_shift_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, 
const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp index 7b777528c2..66d1119d79 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/bitwise_xor.hpp @@ -29,12 +29,12 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace bitwise_xor { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -173,11 +172,11 @@ sycl::event bitwise_xor_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -224,13 +223,13 @@ sycl::event bitwise_xor_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -328,9 +327,9 @@ sycl::event 
bitwise_xor_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -366,11 +365,11 @@ sycl::event bitwise_xor_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp index 21b8f79b81..a51d778490 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cbrt.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace cbrt { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; template struct CbrtFunctor @@ -142,11 +141,11 @@ sycl::event cbrt_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp index 
3672de4d9c..b7b45c4877 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/ceil.hpp @@ -31,10 +31,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace ceil { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -160,11 +159,11 @@ sycl::event ceil_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp index 1794dbf721..a3e8185276 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common.hpp @@ -25,11 +25,11 @@ #pragma once #include #include -#include #include #include #include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" namespace dpctl @@ -41,8 +41,6 @@ namespace kernels namespace elementwise_common { -namespace py = pybind11; - using dpctl::tensor::kernels::alignment_utils:: disabled_sg_loadstore_wrapper_krn; using dpctl::tensor::kernels::alignment_utils::is_aligned; @@ -264,8 +262,8 @@ struct UnaryStridedFunctor void operator()(sycl::id<1> wid) const { const auto &offsets_ = inp_out_indexer_(wid.get(0)); - const py::ssize_t &inp_offset = 
offsets_.get_first_offset(); - const py::ssize_t &res_offset = offsets_.get_second_offset(); + const ssize_t &inp_offset = offsets_.get_first_offset(); + const ssize_t &res_offset = offsets_.get_second_offset(); UnaryOpT op{}; @@ -342,11 +340,11 @@ sycl::event unary_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -533,7 +531,7 @@ struct BinaryStridedFunctor void operator()(sycl::id<1> wid) const { const auto &three_offsets_ = - three_offsets_indexer_(static_cast(wid.get(0))); + three_offsets_indexer_(static_cast(wid.get(0))); const auto &inp1_offset = three_offsets_.get_first_offset(); const auto &inp2_offset = three_offsets_.get_second_offset(); @@ -685,11 +683,11 @@ typedef sycl::event (*unary_strided_impl_fn_ptr_t)( sycl::queue &, size_t, int, - const py::ssize_t *, + const ssize_t *, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &, const std::vector &); @@ -697,24 +695,24 @@ typedef sycl::event (*binary_contig_impl_fn_ptr_t)( sycl::queue &, size_t, const char *, - py::ssize_t, + ssize_t, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); typedef sycl::event (*binary_strided_impl_fn_ptr_t)( sycl::queue &, size_t, int, - const py::ssize_t *, + const ssize_t *, const char *, - py::ssize_t, + ssize_t, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &, const std::vector &); @@ -724,11 +722,11 @@ typedef sycl::event (*binary_contig_matrix_contig_row_broadcast_impl_fn_ptr_t)( size_t, size_t, const char *, - py::ssize_t, + ssize_t, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); typedef sycl::event 
(*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( @@ -737,11 +735,11 @@ typedef sycl::event (*binary_contig_row_contig_matrix_broadcast_impl_fn_ptr_t)( size_t, size_t, const char *, - py::ssize_t, + ssize_t, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); template &depends = {}) { sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -831,13 +829,13 @@ sycl::event binary_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -877,12 +875,12 @@ sycl::event binary_contig_matrix_contig_row_broadcast_impl( size_t n0, size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, // res[i,j] = op(mat[i,j], vec[j]) - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { const argT1 *mat = reinterpret_cast(mat_p) + mat_offset; @@ -955,12 +953,12 @@ sycl::event binary_contig_row_contig_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, // res[i,j] = op(vec[j], mat[i,j]) - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { const argT1 *vec = reinterpret_cast(vec_p) + vec_offset; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp index deaef5522f..86dc2ec60a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/common_inplace.hpp @@ -26,10 +26,10 @@ #pragma once #include #include -#include #include #include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" namespace dpctl { @@ -190,7 +190,7 @@ struct BinaryInplaceStridedFunctor void operator()(sycl::id<1> wid) const { const auto &two_offsets_ = - two_offsets_indexer_(static_cast(wid.get(0))); + two_offsets_indexer_(static_cast(wid.get(0))); const auto &inp_offset = two_offsets_.get_first_offset(); const auto &lhs_offset = two_offsets_.get_second_offset(); @@ -261,20 +261,20 @@ typedef sycl::event (*binary_inplace_contig_impl_fn_ptr_t)( sycl::queue &, size_t, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); typedef sycl::event (*binary_inplace_strided_impl_fn_ptr_t)( sycl::queue &, size_t, int, - const py::ssize_t *, + const ssize_t *, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &, const std::vector &); @@ -284,9 +284,9 @@ typedef sycl::event (*binary_inplace_row_matrix_broadcast_impl_fn_ptr_t)( size_t, size_t, const char *, - py::ssize_t, + ssize_t, char *, - py::ssize_t, + ssize_t, const std::vector &); template &depends = {}) { sycl::event comp_ev = exec_q.submit([&](sycl::handler &cgh) { @@ -360,11 +360,11 @@ sycl::event binary_inplace_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const 
char *rhs_p, - py::ssize_t rhs_offset, + ssize_t rhs_offset, char *lhs_p, - py::ssize_t lhs_offset, + ssize_t lhs_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -399,9 +399,9 @@ sycl::event binary_inplace_row_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const std::vector &depends = {}) { const argT *vec = reinterpret_cast(vec_p) + vec_offset; diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp index b8ebf34a23..24f00a1043 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/conj.hpp @@ -34,10 +34,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace conj { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -171,11 +170,11 @@ sycl::event conj_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp index 00e926d3d8..77cd962f0a 
100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/copysign.hpp @@ -30,11 +30,11 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace copysign { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -127,11 +126,11 @@ template sycl::event copysign_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -176,13 +175,13 @@ sycl::event copysign_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp index e804d5f3df..ab1b55f3cd 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cos.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include 
"utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace cos { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -244,11 +243,11 @@ template sycl::event cos_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp index 16406d5547..80ac46cdf2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/cosh.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace cosh { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -233,11 +232,11 @@ sycl::event cosh_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp index c354b612a7..146d20a0d7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/equal.hpp @@ -31,11 +31,11 @@ #include "sycl_complex.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace equal { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -179,11 +178,11 @@ template sycl::event equal_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -227,13 +226,13 @@ sycl::event equal_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp index 66e84b69bf..99c78c19c1 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include 
"sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace exp { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -201,11 +200,11 @@ template sycl::event exp_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp index ae590fc2bf..d63693ff12 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/exp2.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace exp2 { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -204,11 +203,11 @@ sycl::event exp2_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp index 518aacfe6b..abcb51f8d3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/expm1.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace expm1 { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -214,11 +213,11 @@ sycl::event expm1_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp index a8f810d9ac..a55fcfc565 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor.hpp @@ -31,10 +31,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace floor { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -160,11 +159,11 
@@ sycl::event floor_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp index d09f376d04..2395d9180a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/floor_divide.hpp @@ -30,12 +30,12 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace floor_divide { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -216,11 +215,11 @@ sycl::event floor_divide_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -267,13 +266,13 @@ sycl::event floor_divide_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t 
res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -410,9 +409,9 @@ sycl::event floor_divide_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -448,11 +447,11 @@ sycl::event floor_divide_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp index 2ba942fb32..98bd76248a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater.hpp @@ -32,11 +32,11 @@ #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace greater { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -176,11 +175,11 @@ template sycl::event greater_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { 
return elementwise_common::binary_contig_impl< @@ -224,13 +223,13 @@ sycl::event greater_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp index 48503c608d..afa7a1bed5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/greater_equal.hpp @@ -32,11 +32,11 @@ #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace greater_equal { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -178,11 +177,11 @@ sycl::event greater_equal_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -230,13 +229,13 @@ sycl::event greater_equal_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t 
arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp index 64d8a8f059..e7cccaf211 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/hypot.hpp @@ -30,11 +30,11 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace hypot { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -129,11 +128,11 @@ template sycl::event hypot_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -178,13 +177,13 @@ sycl::event hypot_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp index 03ed7bad78..47fcd5b6b4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/imag.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace imag { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -163,11 +162,11 @@ sycl::event imag_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp index b0dab6249c..17ae5cf43b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isfinite.hpp @@ -31,9 +31,8 @@ #include #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -44,7 +43,6 @@ namespace kernels namespace isfinite { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -161,11 +159,11 @@ sycl::event isfinite_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t 
*shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp index 8c805b7934..7a3c24a553 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isinf.hpp @@ -30,10 +30,10 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -44,7 +44,6 @@ namespace kernels namespace isinf { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -158,11 +157,11 @@ sycl::event isinf_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp index 8b10ce2295..1a20e38036 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/isnan.hpp @@ -29,10 +29,10 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -43,7 +43,6 @@ namespace 
kernels namespace isnan { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -156,11 +155,11 @@ sycl::event isnan_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp index 1827ca3185..7e9634dba0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less.hpp @@ -29,13 +29,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace less { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -175,11 +174,11 @@ template sycl::event less_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -222,13 +221,13 @@ sycl::event less_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const 
char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp index 0b6d06fff3..4964715da3 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/less_equal.hpp @@ -32,11 +32,10 @@ #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -47,7 +46,6 @@ namespace kernels namespace less_equal { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -176,11 +174,11 @@ template sycl::event less_equal_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -225,13 +223,13 @@ sycl::event less_equal_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp 
b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp index 48ec92a257..d8a7c0350b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace log { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -159,11 +158,11 @@ template sycl::event log_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp index 6f3c9d1925..ab53ec5b73 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log10.hpp @@ -34,10 +34,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace log10 { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -179,11 +178,11 @@ sycl::event 
log10_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp index 3b417b46b9..af36ecda79 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log1p.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace log1p { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -180,11 +179,11 @@ sycl::event log1p_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp index 079c5bf94b..1c1d274b47 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/log2.hpp @@ -34,10 +34,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include 
"kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace log2 { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -179,11 +178,11 @@ sycl::event log2_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp index 9fb3759779..aee0ae6b7f 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logaddexp.hpp @@ -31,13 +31,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace logaddexp { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -145,11 +144,11 @@ template sycl::event logaddexp_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return 
elementwise_common::binary_contig_impl< @@ -195,13 +194,13 @@ sycl::event logaddexp_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp index 135c264751..60e0f133c2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_and.hpp @@ -30,12 +30,12 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace logical_and { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -171,11 +170,11 @@ sycl::event logical_and_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -222,13 +221,13 @@ sycl::event logical_and_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - 
py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp index 8a820c5172..959b5aab01 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_not.hpp @@ -30,10 +30,10 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -44,7 +44,6 @@ namespace kernels namespace logical_not { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -134,11 +133,11 @@ sycl::event logical_not_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp index a444ced41f..f3ca6cd4d5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_or.hpp @@ -30,12 +30,12 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include 
"kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace logical_or { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -169,11 +168,11 @@ template sycl::event logical_or_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -219,13 +218,13 @@ sycl::event logical_or_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp index a3d175f413..0ee26837be 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/logical_xor.hpp @@ -30,12 +30,12 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace logical_xor { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -172,11 +171,11 @@ sycl::event 
logical_xor_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -223,13 +222,13 @@ sycl::event logical_xor_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp index 22a63882a9..da32fb6f7b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/maximum.hpp @@ -29,13 +29,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace maximum { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -193,11 +192,11 @@ template sycl::event maximum_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { 
return elementwise_common::binary_contig_impl< @@ -242,13 +241,13 @@ sycl::event maximum_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp index a11a36aeee..c6e5e841c2 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/minimum.hpp @@ -29,13 +29,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace minimum { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -193,11 +192,11 @@ template sycl::event minimum_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -242,13 +241,13 @@ sycl::event minimum_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char 
*arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp index 7aeea6fdc7..7e7dd13c1c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/multiply.hpp @@ -30,14 +30,14 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace multiply { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -198,11 +197,11 @@ template sycl::event multiply_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -248,13 +247,13 @@ sycl::event multiply_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -301,12 +300,12 @@ sycl::event 
multiply_contig_matrix_contig_row_broadcast_impl( size_t n0, size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, // res[i,j] = mat[i,j] * vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< @@ -351,12 +350,12 @@ sycl::event multiply_contig_row_contig_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, // res[i,j] = mat[i,j] * vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return multiply_contig_matrix_contig_row_broadcast_impl( @@ -446,9 +445,9 @@ sycl::event multiply_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -484,11 +483,11 @@ sycl::event multiply_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -535,9 +534,9 @@ sycl::event multiply_inplace_row_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - 
py::ssize_t vec_offset, + ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_row_matrix_broadcast_impl< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp index b67e74438f..2a51c0bbb4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/negative.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace negative { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -154,11 +153,11 @@ sycl::event negative_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp index a5bc3a6cc6..c119289690 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/not_equal.hpp @@ -29,12 +29,12 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include 
"utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace not_equal { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -179,11 +178,11 @@ template sycl::event not_equal_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -228,13 +227,13 @@ sycl::event not_equal_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp index 6136a55bce..8ee09d409a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/positive.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace positive { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -169,11 +168,11 
@@ sycl::event positive_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp index 65214c9533..6db8d9fcb7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/pow.hpp @@ -30,14 +30,14 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace pow { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -251,11 +250,11 @@ template sycl::event pow_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -298,13 +297,13 @@ template sycl::event pow_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + 
ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -458,9 +457,9 @@ sycl::event pow_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -492,11 +491,11 @@ sycl::event pow_inplace_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp index 2fe1d70bd5..c4553310d4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/proj.hpp @@ -34,10 +34,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace proj { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -194,11 +193,11 @@ sycl::event proj_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp index 94cdaf1496..e2418553c7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/real.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace real { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -163,11 +162,11 @@ sycl::event real_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp index ecc5959fc3..7e1e27f5ea 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/reciprocal.hpp @@ -32,13 +32,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" -#include namespace dpctl { @@ -49,7 +49,6 @@ namespace kernels namespace reciprocal { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; 
using dpctl::tensor::type_utils::is_complex; @@ -167,11 +166,11 @@ sycl::event reciprocal_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp index b25c7be91f..5b1c6cc815 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/remainder.hpp @@ -30,13 +30,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace remainder { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -232,11 +231,11 @@ template sycl::event remainder_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -281,13 +280,13 @@ sycl::event remainder_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t 
arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -434,9 +433,9 @@ sycl::event remainder_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -472,11 +471,11 @@ sycl::event remainder_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp index e8221e4c25..c0340ef13a 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/round.hpp @@ -31,10 +31,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace round { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -171,11 +170,11 @@ sycl::event round_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector 
&depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp index a7f70337f8..9d4a28fe52 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/rsqrt.hpp @@ -35,10 +35,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -49,7 +49,6 @@ namespace kernels namespace rsqrt { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; template struct RsqrtFunctor @@ -145,11 +144,11 @@ sycl::event rsqrt_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp index 804b6260f1..2cc6887b1b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sign.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace sign { -namespace py = pybind11; namespace td_ns = 
dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -192,11 +191,11 @@ sycl::event sign_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp index 401db90b63..a0f474c293 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/signbit.hpp @@ -30,10 +30,10 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -44,7 +44,6 @@ namespace kernels namespace signbit { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -151,11 +150,11 @@ sycl::event signbit_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp index debdfa5fca..37b718f7b4 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sin.hpp @@ -32,10 +32,10 @@ #include 
"kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace sin { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -265,11 +264,11 @@ template sycl::event sin_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp index cd7f998afa..0883a6dcc0 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sinh.hpp @@ -32,10 +32,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace sinh { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -235,11 +234,11 @@ sycl::event sinh_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, 
const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp index 3eea801fb2..970e215591 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/sqrt.hpp @@ -35,10 +35,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -49,7 +49,6 @@ namespace kernels namespace sqrt { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -343,11 +342,11 @@ sycl::event sqrt_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp index 656ec47c4b..b6650b65e7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/square.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace square { -namespace py = pybind11; 
namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -186,11 +185,11 @@ sycl::event square_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp index a24e71e9d2..544d91c02b 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/subtract.hpp @@ -29,13 +29,13 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -46,7 +46,6 @@ namespace kernels namespace subtract { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -180,11 +179,11 @@ template sycl::event subtract_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -229,13 +228,13 @@ sycl::event subtract_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, 
const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -293,12 +292,12 @@ sycl::event subtract_contig_matrix_contig_row_broadcast_impl( size_t n0, size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, // res[i,j] = mat[i,j] - vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< @@ -346,12 +345,12 @@ sycl::event subtract_contig_row_contig_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, // res[i,j] = op(vec[j], mat[i,j]) - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< @@ -443,9 +442,9 @@ sycl::event subtract_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -481,11 +480,11 @@ sycl::event subtract_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -532,9 +531,9 @@ sycl::event subtract_inplace_row_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_row_matrix_broadcast_impl< diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp index e2d08cba0d..d944e43bb5 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tan.hpp @@ -33,10 +33,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ 
namespace kernels namespace tan { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -209,11 +208,11 @@ template sycl::event tan_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp index 13ea6c7eee..d0ee54fe8c 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/tanh.hpp @@ -34,10 +34,10 @@ #include "kernels/elementwise_functions/common.hpp" #include "sycl_complex.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -48,7 +48,6 @@ namespace kernels namespace tanh { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -205,11 +204,11 @@ sycl::event tanh_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp index e063ecef54..ab06a52229 100644 --- 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/true_divide.hpp @@ -29,14 +29,14 @@ #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "sycl_complex.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" #include "kernels/elementwise_functions/common.hpp" #include "kernels/elementwise_functions/common_inplace.hpp" -#include namespace dpctl { @@ -47,7 +47,6 @@ namespace kernels namespace true_divide { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace tu_ns = dpctl::tensor::type_utils; @@ -200,11 +199,11 @@ sycl::event true_divide_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_impl< @@ -250,13 +249,13 @@ sycl::event true_divide_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg1_p, - py::ssize_t arg1_offset, + ssize_t arg1_offset, const char *arg2_p, - py::ssize_t arg2_offset, + ssize_t arg2_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -315,12 +314,12 @@ sycl::event true_divide_contig_matrix_contig_row_broadcast_impl( size_t n0, size_t n1, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. 
matrix, // res[i,j] = mat[i,j] / vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_matrix_contig_row_broadcast_impl< @@ -368,12 +367,12 @@ sycl::event true_divide_contig_row_contig_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, const char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, char *res_p, // typeless pointer to (n0, n1) result C-contig. matrix, // res[i,j] = mat[i,j] + vec[j] - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_contig_row_contig_matrix_broadcast_impl< @@ -541,9 +540,9 @@ sycl::event true_divide_inplace_contig_impl(sycl::queue &exec_q, size_t nelems, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_contig_impl< @@ -579,11 +578,11 @@ sycl::event true_divide_inplace_strided_impl( sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { @@ -630,9 +629,9 @@ sycl::event true_divide_inplace_row_matrix_broadcast_impl( size_t n0, size_t n1, const char *vec_p, // typeless pointer to (n1,) contiguous row - py::ssize_t vec_offset, + ssize_t vec_offset, char *mat_p, // typeless pointer to (n0, n1) C-contiguous matrix - py::ssize_t mat_offset, + ssize_t mat_offset, const std::vector &depends = {}) { return elementwise_common::binary_inplace_row_matrix_broadcast_impl< diff --git 
a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp index 35b0783719..b27792fda7 100644 --- a/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp +++ b/dpctl/tensor/libtensor/include/kernels/elementwise_functions/trunc.hpp @@ -31,10 +31,10 @@ #include "kernels/elementwise_functions/common.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -#include namespace dpctl { @@ -45,7 +45,6 @@ namespace kernels namespace trunc { -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; using dpctl::tensor::type_utils::is_complex; @@ -157,11 +156,11 @@ sycl::event trunc_strided_impl(sycl::queue &exec_q, size_t nelems, int nd, - const py::ssize_t *shape_and_strides, + const ssize_t *shape_and_strides, const char *arg_p, - py::ssize_t arg_offset, + ssize_t arg_offset, char *res_p, - py::ssize_t res_offset, + ssize_t res_offset, const std::vector &depends, const std::vector &additional_depends) { diff --git a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp index 769774f4dd..07463b118e 100644 --- a/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp +++ b/dpctl/tensor/libtensor/include/kernels/integer_advanced_indexing.hpp @@ -26,10 +26,10 @@ #include #include #include -#include #include #include +#include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" @@ -42,7 +42,6 @@ namespace kernels namespace indexing { -namespace py = pybind11; using namespace dpctl::tensor::offset_utils; template (max_item, 1); - ind = std::clamp(ind, -max_item, max_item - 1); + max_item = std::max(max_item, 1); + ind = std::clamp(ind, -max_item, max_item - 1); ind = (ind < 0) ? 
ind + max_item : ind; return; } @@ -79,10 +78,10 @@ class ClipIndex public: ClipIndex() = default; - void operator()(py::ssize_t max_item, py::ssize_t &ind) const + void operator()(ssize_t max_item, ssize_t &ind) const { - max_item = std::max(max_item, 1); - ind = std::clamp(ind, 0, max_item - 1); + max_item = std::max(max_item, 1); + ind = std::clamp(ind, 0, max_item - 1); return; } }; @@ -101,7 +100,7 @@ class TakeFunctor char **ind_ = nullptr; int k_ = 0; size_t ind_nelems_ = 0; - const py::ssize_t *axes_shape_and_strides_ = nullptr; + const ssize_t *axes_shape_and_strides_ = nullptr; OrthogStrider orthog_strider; IndicesStrider ind_strider; AxesStrider axes_strider; @@ -112,7 +111,7 @@ class TakeFunctor char **ind_cp, int k, size_t ind_nelems, - const py::ssize_t *axes_shape_and_strides, + const ssize_t *axes_shape_and_strides, OrthogStrider orthog_strider_, IndicesStrider ind_strider_, AxesStrider axes_strider_) @@ -129,20 +128,20 @@ class TakeFunctor const T *src = reinterpret_cast(src_); T *dst = reinterpret_cast(dst_); - py::ssize_t i_orthog = id / ind_nelems_; - py::ssize_t i_along = id - (i_orthog * ind_nelems_); + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); auto orthog_offsets = orthog_strider(i_orthog); - py::ssize_t src_offset = orthog_offsets.get_first_offset(); - py::ssize_t dst_offset = orthog_offsets.get_second_offset(); + ssize_t src_offset = orthog_offsets.get_first_offset(); + ssize_t dst_offset = orthog_offsets.get_second_offset(); ProjectorT proj{}; for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { indT *ind_data = reinterpret_cast(ind_[axis_idx]); - py::ssize_t ind_offset = ind_strider(i_along, axis_idx); - py::ssize_t i = static_cast(ind_data[ind_offset]); + ssize_t ind_offset = ind_strider(i_along, axis_idx); + ssize_t i = static_cast(ind_data[ind_offset]); proj(axes_shape_and_strides_[axis_idx], i); @@ -161,15 +160,15 @@ typedef sycl::event (*take_fn_ptr_t)(sycl::queue &, int, int, int, - const 
py::ssize_t *, - const py::ssize_t *, - const py::ssize_t *, + const ssize_t *, + const ssize_t *, + const ssize_t *, const char *, char *, char **, - py::ssize_t, - py::ssize_t, - const py::ssize_t *, + ssize_t, + ssize_t, + const ssize_t *, const std::vector &); template @@ -179,15 +178,15 @@ sycl::event take_impl(sycl::queue &q, int nd, int ind_nd, int k, - const py::ssize_t *orthog_shape_and_strides, - const py::ssize_t *axes_shape_and_strides, - const py::ssize_t *ind_shape_and_strides, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, const char *src_p, char *dst_p, char **ind_p, - py::ssize_t src_offset, - py::ssize_t dst_offset, - const py::ssize_t *ind_offsets, + ssize_t src_offset, + ssize_t dst_offset, + const ssize_t *ind_offsets, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); @@ -231,7 +230,7 @@ class PutFunctor char **ind_ = nullptr; int k_ = 0; size_t ind_nelems_ = 0; - const py::ssize_t *axes_shape_and_strides_ = nullptr; + const ssize_t *axes_shape_and_strides_ = nullptr; OrthogStrider orthog_strider; IndicesStrider ind_strider; AxesStrider axes_strider; @@ -242,7 +241,7 @@ class PutFunctor char **ind_cp, int k, size_t ind_nelems, - const py::ssize_t *axes_shape_and_strides, + const ssize_t *axes_shape_and_strides, OrthogStrider orthog_strider_, IndicesStrider ind_strider_, AxesStrider axes_strider_) @@ -259,20 +258,20 @@ class PutFunctor T *dst = reinterpret_cast(dst_); const T *val = reinterpret_cast(val_); - py::ssize_t i_orthog = id / ind_nelems_; - py::ssize_t i_along = id - (i_orthog * ind_nelems_); + ssize_t i_orthog = id / ind_nelems_; + ssize_t i_along = id - (i_orthog * ind_nelems_); auto orthog_offsets = orthog_strider(i_orthog); - py::ssize_t dst_offset = orthog_offsets.get_first_offset(); - py::ssize_t val_offset = orthog_offsets.get_second_offset(); + ssize_t dst_offset = orthog_offsets.get_first_offset(); + ssize_t 
val_offset = orthog_offsets.get_second_offset(); ProjectorT proj{}; for (int axis_idx = 0; axis_idx < k_; ++axis_idx) { indT *ind_data = reinterpret_cast(ind_[axis_idx]); - py::ssize_t ind_offset = ind_strider(i_along, axis_idx); - py::ssize_t i = static_cast(ind_data[ind_offset]); + ssize_t ind_offset = ind_strider(i_along, axis_idx); + ssize_t i = static_cast(ind_data[ind_offset]); proj(axes_shape_and_strides_[axis_idx], i); @@ -291,15 +290,15 @@ typedef sycl::event (*put_fn_ptr_t)(sycl::queue &, int, int, int, - const py::ssize_t *, - const py::ssize_t *, - const py::ssize_t *, + const ssize_t *, + const ssize_t *, + const ssize_t *, char *, const char *, char **, - py::ssize_t, - py::ssize_t, - const py::ssize_t *, + ssize_t, + ssize_t, + const ssize_t *, const std::vector &); template @@ -309,15 +308,15 @@ sycl::event put_impl(sycl::queue &q, int nd, int ind_nd, int k, - const py::ssize_t *orthog_shape_and_strides, - const py::ssize_t *axes_shape_and_strides, - const py::ssize_t *ind_shape_and_strides, + const ssize_t *orthog_shape_and_strides, + const ssize_t *axes_shape_and_strides, + const ssize_t *ind_shape_and_strides, char *dst_p, const char *val_p, char **ind_p, - py::ssize_t dst_offset, - py::ssize_t val_offset, - const py::ssize_t *ind_offsets, + ssize_t dst_offset, + ssize_t val_offset, + const ssize_t *ind_offsets, const std::vector &depends) { dpctl::tensor::type_utils::validate_type_for_device(q); diff --git a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp index 15e5e35d67..039417d6a5 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/dot_product.hpp @@ -1,15 +1,15 @@ #pragma once -#include #include #include #include +#include #include #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/reductions.hpp" -#include 
"pybind11/pybind11.h" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" @@ -53,9 +53,9 @@ struct SequentialDotProduct { auto const &batch_offsets = batch_indexer_(id[0]); - const py::ssize_t &lhs_batch_offset = batch_offsets.get_first_offset(); - const py::ssize_t &rhs_batch_offset = batch_offsets.get_second_offset(); - const py::ssize_t &out_batch_offset = batch_offsets.get_third_offset(); + const ssize_t &lhs_batch_offset = batch_offsets.get_first_offset(); + const ssize_t &rhs_batch_offset = batch_offsets.get_second_offset(); + const ssize_t &out_batch_offset = batch_offsets.get_third_offset(); outT red_val(0); for (size_t m = 0; m < reduction_max_gid_; ++m) { @@ -180,14 +180,14 @@ typedef sycl::event (*dot_product_impl_fn_ptr_t)( const char *, char *, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, const std::vector &); template @@ -198,14 +198,14 @@ sycl::event dot_product_impl(sycl::queue &exec_q, const char *rhs_cp, char *res_cp, int batch_nd, - const py::ssize_t *batch_shape_and_strides, - py::ssize_t batch_lhs_offset, - py::ssize_t batch_rhs_offset, - py::ssize_t batch_res_offset, + const ssize_t *batch_shape_and_strides, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_lhs_offset, - py::ssize_t reduction_rhs_offset, + const ssize_t *reduction_shape_stride, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, const std::vector &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); @@ -250,8 +250,8 @@ sycl::event dot_product_impl(sycl::queue &exec_q, using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; - const py::ssize_t *const &res_shape = batch_shape_and_strides; - const py::ssize_t 
*const &res_strides = + const ssize_t *const &res_shape = batch_shape_and_strides; + const ssize_t *const &res_strides = batch_shape_and_strides + 3 * batch_nd; IndexerT res_indexer(batch_nd, batch_res_offset, res_shape, res_strides); @@ -317,11 +317,11 @@ typedef sycl::event (*dot_product_contig_impl_fn_ptr_t)( const char *, const char *, char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template @@ -332,11 +332,11 @@ dot_product_contig_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - py::ssize_t batch_lhs_offset, - py::ssize_t batch_rhs_offset, - py::ssize_t batch_res_offset, - py::ssize_t reduction_lhs_offset, - py::ssize_t reduction_rhs_offset, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, const std::vector &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp) + @@ -364,8 +364,8 @@ dot_product_contig_impl(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(reduction_nelems), - static_cast(batches)}; + 0, static_cast(reduction_nelems), + static_cast(batches)}; InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; @@ -404,8 +404,8 @@ dot_product_contig_impl(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(reduction_nelems), - static_cast(batches)}; + 0, static_cast(reduction_nelems), + static_cast(batches)}; InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; @@ -543,14 +543,14 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, const char *rhs_cp, char *res_cp, int 
batch_nd, - const py::ssize_t *batch_shape_and_strides, - py::ssize_t batch_lhs_offset, - py::ssize_t batch_rhs_offset, - py::ssize_t batch_res_offset, + const ssize_t *batch_shape_and_strides, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_lhs_offset, - py::ssize_t reduction_rhs_offset, + const ssize_t *reduction_shape_stride, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, const std::vector &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); @@ -743,8 +743,8 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(reduction_groups_)}; + 0, static_cast(batches), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{ @@ -785,8 +785,8 @@ sycl::event dot_product_tree_impl(sycl::queue &exec_q, using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(batches), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{batch_nd, batch_res_offset, /* shape */ batch_shape_and_strides, /* strides */ batch_shape_and_strides + @@ -842,11 +842,11 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, const char *lhs_cp, const char *rhs_cp, char *res_cp, - py::ssize_t batch_lhs_offset, - py::ssize_t batch_rhs_offset, - py::ssize_t batch_res_offset, - py::ssize_t reduction_lhs_offset, - py::ssize_t reduction_rhs_offset, + ssize_t batch_lhs_offset, + ssize_t batch_rhs_offset, + ssize_t batch_res_offset, + ssize_t reduction_lhs_offset, + ssize_t reduction_rhs_offset, const std::vector &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp) + @@ -874,8 +874,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, NoOpIndexerT, 
NoOpIndexerT>; InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(reduction_nelems), - static_cast(batches)}; + 0, static_cast(reduction_nelems), + static_cast(batches)}; InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; @@ -916,8 +916,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(reduction_nelems), - static_cast(batches)}; + 0, static_cast(reduction_nelems), + static_cast(batches)}; InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; @@ -994,8 +994,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, NoOpIndexerT, NoOpIndexerT>; InputBatchIndexerT inp_batch_indexer{ - 0, static_cast(reduction_nelems), - static_cast(batches)}; + 0, static_cast(reduction_nelems), + static_cast(batches)}; InputOutputBatchIndexerT inp_out_batch_indexer{ inp_batch_indexer, inp_batch_indexer, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; @@ -1045,8 +1045,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(reduction_groups_)}; + 0, static_cast(batches), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{ @@ -1086,8 +1086,8 @@ dot_product_contig_tree_impl(sycl::queue &exec_q, using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(batches), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(batches), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, diff --git 
a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp index a4a5d3b929..0d90917885 100644 --- a/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp +++ b/dpctl/tensor/libtensor/include/kernels/linalg_functions/gemm.hpp @@ -1,15 +1,15 @@ #pragma once -#include #include #include #include +#include #include #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "kernels/reductions.hpp" -#include "pybind11/pybind11.h" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" #include "utils/type_utils.hpp" @@ -89,8 +89,8 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, size_t preferred_reductions_per_wi, size_t reductions_per_wi, int res_nd, - py::ssize_t res_offset, - const py::ssize_t *res_shapes_strides, + ssize_t res_offset, + const ssize_t *res_shapes_strides, const std::vector &depends) { sycl::event red_ev; @@ -110,8 +110,8 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, res_iter_indexer}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; sycl::range<1> iter_range{iter_nelems}; @@ -141,8 +141,8 @@ sycl::event single_reduction_for_gemm(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, res_iter_indexer}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; if (iter_nelems == 1) { // increase GPU occupancy @@ -205,8 +205,8 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + 
static_cast(iter_nelems)}; sycl::range<1> iter_range{iter_nelems}; @@ -234,8 +234,8 @@ single_reduction_for_gemm_contig(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; if (iter_nelems == 1) { // increase GPU occupancy @@ -282,8 +282,8 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, size_t preferred_reductions_per_wi, size_t reductions_per_wi, int res_nd, - py::ssize_t res_offset, - const py::ssize_t *res_shape_strides, + ssize_t res_offset, + const ssize_t *res_shape_strides, const std::vector &depends) { @@ -305,8 +305,8 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; @@ -346,9 +346,8 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, InputIndexerT, ResIndexerT>; using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + InputIndexerT inp_indexer{0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -390,10 +389,10 @@ sycl::event tree_reduction_for_gemm(sycl::queue &exec_q, using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; - ResIndexerT res_iter_indexer{ - res_nd, static_cast(res_offset), 
res_shape_strides}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; + ResIndexerT res_iter_indexer{res_nd, static_cast(res_offset), + res_shape_strides}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, res_iter_indexer}; @@ -463,8 +462,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; auto localRange = sycl::range<1>{wg}; @@ -507,9 +506,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, // n * m = iter_nelems because essentially, this process // creates a stack of reduction_nelems 2D matrices and we reduce // along the stack axis - InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + InputIndexerT inp_indexer{0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -551,8 +549,8 @@ tree_reduction_for_gemm_contig(sycl::queue &exec_q, using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1172,19 +1170,19 @@ class gemm_nm_krn; typedef sycl::event (*gemm_impl_fn_ptr_t)( sycl::queue &, - const char *, // lhs - const char *, // rhs - char *, // res - size_t, // lhs_outer_nelems (n) - size_t, // inner_nelems (k) - size_t, // rhs_outer_nelems (m) - int, // inner nd - int, // lhs outer nd - const py::ssize_t *, // lhs shape and strides - int, // rhs outer 
nd - const py::ssize_t *, // rhs shape and strides - int, // res outer nd - const py::ssize_t *, // res shape and strides + const char *, // lhs + const char *, // rhs + char *, // res + size_t, // lhs_outer_nelems (n) + size_t, // inner_nelems (k) + size_t, // rhs_outer_nelems (m) + int, // inner nd + int, // lhs outer nd + const ssize_t *, // lhs shape and strides + int, // rhs outer nd + const ssize_t *, // rhs shape and strides + int, // res outer nd + const ssize_t *, // res shape and strides std::vector const &); template @@ -1197,11 +1195,11 @@ sycl::event gemm_impl(sycl::queue &exec_q, size_t m, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_shape_strides, + const ssize_t *lhs_shape_strides, int rhs_outer_nd, - const py::ssize_t *rhs_shape_strides, + const ssize_t *rhs_shape_strides, int res_outer_nd, - const py::ssize_t *res_shape_strides, + const ssize_t *res_shape_strides, std::vector const &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); @@ -2127,11 +2125,11 @@ sycl::event gemm_tree_k_impl(sycl::queue &exec_q, size_t m, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, + const ssize_t *lhs_outer_inner_shapes_strides, int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, + const ssize_t *rhs_outer_inner_shapes_strides, int res_nd, - const py::ssize_t *res_shapes_strides, + const ssize_t *res_shapes_strides, const std::vector &depends) { size_t delta_k(4); @@ -2432,11 +2430,11 @@ sycl::event gemm_tree_nm_impl(sycl::queue &exec_q, size_t m, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, + const ssize_t *lhs_outer_inner_shapes_strides, int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, + const ssize_t *rhs_outer_inner_shapes_strides, int res_nd, - const py::ssize_t *res_shapes_strides, + const ssize_t *res_shapes_strides, const std::vector &depends) { constexpr int wi_delta_n = 2; @@ -2782,11 +2780,11 @@ sycl::event 
gemm_tree_impl(sycl::queue &exec_q, size_t m, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, + const ssize_t *lhs_outer_inner_shapes_strides, int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, + const ssize_t *rhs_outer_inner_shapes_strides, int res_nd, - const py::ssize_t *res_shapes_strides, + const ssize_t *res_shapes_strides, std::vector const &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); @@ -3601,8 +3599,7 @@ class GemmBatchFunctorThreadNM const size_t gr_id = it.get_group_linear_id() - m_id * n_groups_per_batch; - const auto &three_offsets_ = - batch_indexer(static_cast(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); // lift group_id to (block_i, block_j, block_s), // 0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s @@ -3789,8 +3786,7 @@ class GemmBatchFunctorThreadNM(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); // lift group_id to (block_i, block_j, block_s), // 0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s @@ -3961,8 +3957,7 @@ class GemmBatchFunctorThreadK it.get_group_linear_id() - m_id * n_groups_per_batch; size_t lid = it.get_local_linear_id(); - const auto &three_offsets_ = - batch_indexer(static_cast(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); const auto &lhs_offset = three_offsets_.get_first_offset(); const auto &rhs_offset = three_offsets_.get_second_offset(); @@ -4130,8 +4125,7 @@ class GemmBatchFunctorThreadK(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); const auto &lhs_offset = three_offsets_.get_first_offset(); const auto &rhs_offset = three_offsets_.get_second_offset(); @@ -4225,26 +4219,26 @@ class gemm_batch_nm_krn; typedef sycl::event (*gemm_batch_impl_fn_ptr_t)( sycl::queue &, - const char *, // lhs - const char *, // rhs - char *, // res - size_t, // batch nelems - size_t, // lhs outer nelems (n) - size_t, // inner nelems (k) 
- size_t, // rhs outer nelems (m) - int, // batching nd - const py::ssize_t *, // batch shape strides - py::ssize_t, // lhs batch offset - py::ssize_t, // rhs batch offset - py::ssize_t, // res batch offset - int, // inner dims - int, // lhs outer dims - const py::ssize_t *, // lhs outer and inner shape and strides - int, // rhs outer dims - const py::ssize_t *, // rhs outer and inner shape and strides - int, // res outer dims - const py::ssize_t *, // res outer and inner shape and strides - const py::ssize_t *, // res full shape and strides + const char *, // lhs + const char *, // rhs + char *, // res + size_t, // batch nelems + size_t, // lhs outer nelems (n) + size_t, // inner nelems (k) + size_t, // rhs outer nelems (m) + int, // batching nd + const ssize_t *, // batch shape strides + ssize_t, // lhs batch offset + ssize_t, // rhs batch offset + ssize_t, // res batch offset + int, // inner dims + int, // lhs outer dims + const ssize_t *, // lhs outer and inner shape and strides + int, // rhs outer dims + const ssize_t *, // rhs outer and inner shape and strides + int, // res outer dims + const ssize_t *, // res outer and inner shape and strides + const ssize_t *, // res full shape and strides std::vector const &); template @@ -4257,18 +4251,18 @@ sycl::event gemm_batch_impl(sycl::queue &exec_q, size_t k, size_t m, int batch_nd, - const py::ssize_t *batch_shape_strides, - py::ssize_t lhs_batch_offset, - py::ssize_t rhs_batch_offset, - py::ssize_t res_batch_offset, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, + const ssize_t *lhs_outer_inner_shapes_strides, int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, + const ssize_t *rhs_outer_inner_shapes_strides, int res_outer_nd, - const py::ssize_t *res_outer_shapes_strides, - const py::ssize_t *res_shape_strides, + const ssize_t 
*res_outer_shapes_strides, + const ssize_t *res_shape_strides, std::vector const &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); @@ -4461,9 +4455,9 @@ typedef sycl::event (*gemm_batch_contig_impl_fn_ptr_t)( size_t, // n size_t, // k size_t, // m - py::ssize_t, // lhs batch offset - py::ssize_t, // rhs batch offset - py::ssize_t, // res batch offset + ssize_t, // lhs batch offset + ssize_t, // rhs batch offset + ssize_t, // res batch offset std::vector const &); template @@ -4475,9 +4469,9 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, size_t n, size_t k, size_t m, - py::ssize_t lhs_batch_offset, - py::ssize_t rhs_batch_offset, - py::ssize_t res_batch_offset, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, std::vector const &depends = {}) { const lhsTy *lhs_tp = @@ -4514,12 +4508,12 @@ sycl::event gemm_batch_contig_impl(sycl::queue &exec_q, Strided1DIndexer>; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k * m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); if (m < 4) { constexpr size_t m_groups = 1; size_t delta_k(4); @@ -4726,8 +4720,7 @@ class GemmBatchNoAtomicFunctorThreadNM const size_t gr_id = it.get_group_linear_id() - m_id * n_groups_per_batch; - const auto &three_offsets_ = - batch_indexer(static_cast(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); // lift group_id to (block_i, block_j, block_s), // 0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s @@ -4911,8 +4904,7 @@ class GemmBatchNoAtomicFunctorThreadNM(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); // lift group_id 
to (block_i, block_j, block_s), // 0 <= block_i < n_blocks, 0 <= block_j < m_blocks, 0 <= block_s @@ -5074,8 +5066,7 @@ class GemmBatchNoAtomicFunctorThreadK it.get_group_linear_id() - m_id * n_groups_per_batch; size_t lid = it.get_local_linear_id(); - const auto &three_offsets_ = - batch_indexer(static_cast(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); const auto &lhs_offset = three_offsets_.get_first_offset(); const auto &rhs_offset = three_offsets_.get_second_offset(); const auto &res_offset = three_offsets_.get_third_offset(); @@ -5231,8 +5222,7 @@ class GemmBatchNoAtomicFunctorThreadK(m_id)); + const auto &three_offsets_ = batch_indexer(static_cast(m_id)); const auto &lhs_offset = three_offsets_.get_first_offset(); const auto &rhs_offset = three_offsets_.get_second_offset(); const auto &res_offset = three_offsets_.get_third_offset(); @@ -5329,18 +5319,18 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, size_t k, size_t m, int batch_nd, - const py::ssize_t *batch_shape_strides, - py::ssize_t lhs_batch_offset, - py::ssize_t rhs_batch_offset, - py::ssize_t res_batch_offset, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, + const ssize_t *lhs_outer_inner_shapes_strides, int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, + const ssize_t *rhs_outer_inner_shapes_strides, int res_outer_nd, - const py::ssize_t *res_outer_shapes_strides, - const py::ssize_t *res_shape_strides, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, std::vector const &depends) { size_t delta_k(4); @@ -5488,7 +5478,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, batch_nd, rhs_batch_offset, batch_shape_strides, batch_shape_strides + 2 * batch_nd); Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + 0, static_cast(batch_nelems), n * m); 
BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -5601,7 +5591,7 @@ gemm_batch_tree_k_impl(sycl::queue &exec_q, batch_shape_strides + 2 * batch_nd); Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + 0, static_cast(batch_nelems), n * m); BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -5692,18 +5682,18 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, size_t k, size_t m, int batch_nd, - const py::ssize_t *batch_shape_strides, - py::ssize_t lhs_batch_offset, - py::ssize_t rhs_batch_offset, - py::ssize_t res_batch_offset, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, int inner_nd, int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, + const ssize_t *lhs_outer_inner_shapes_strides, int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, + const ssize_t *rhs_outer_inner_shapes_strides, int res_outer_nd, - const py::ssize_t *res_outer_shapes_strides, - const py::ssize_t *res_shape_strides, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, std::vector const &depends) { constexpr int wi_delta_n = 2; @@ -5865,7 +5855,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, batch_nd, rhs_batch_offset, batch_shape_strides, batch_shape_strides + 2 * batch_nd); Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + 0, static_cast(batch_nelems), n * m); BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -5988,7 +5978,7 @@ gemm_batch_tree_nm_impl(sycl::queue &exec_q, batch_nd, rhs_batch_offset, batch_shape_strides, batch_shape_strides + 2 * batch_nd); Strided1DIndexer tmp_batch_indexer( - 0, static_cast(batch_nelems), n * m); + 0, static_cast(batch_nelems), n * m); BatchDimsIndexerT batch_indexer( lhs_batch_indexer, rhs_batch_indexer, tmp_batch_indexer); @@ -6082,29 
+6072,28 @@ template class gemm_batch_tree_empty_krn; template -sycl::event -gemm_batch_tree_impl(sycl::queue &exec_q, - const char *lhs_cp, - const char *rhs_cp, - char *res_cp, - size_t batch_nelems, - size_t n, - size_t k, - size_t m, - int batch_nd, - const py::ssize_t *batch_shape_strides, - py::ssize_t lhs_batch_offset, - py::ssize_t rhs_batch_offset, - py::ssize_t res_batch_offset, - int inner_nd, - int lhs_outer_nd, - const py::ssize_t *lhs_outer_inner_shapes_strides, - int rhs_outer_nd, - const py::ssize_t *rhs_outer_inner_shapes_strides, - int res_outer_nd, - const py::ssize_t *res_outer_shapes_strides, - const py::ssize_t *res_shape_strides, - std::vector const &depends = {}) +sycl::event gemm_batch_tree_impl(sycl::queue &exec_q, + const char *lhs_cp, + const char *rhs_cp, + char *res_cp, + size_t batch_nelems, + size_t n, + size_t k, + size_t m, + int batch_nd, + const ssize_t *batch_shape_strides, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, + int inner_nd, + int lhs_outer_nd, + const ssize_t *lhs_outer_inner_shapes_strides, + int rhs_outer_nd, + const ssize_t *rhs_outer_inner_shapes_strides, + int res_outer_nd, + const ssize_t *res_outer_shapes_strides, + const ssize_t *res_shape_strides, + std::vector const &depends = {}) { const lhsTy *lhs_tp = reinterpret_cast(lhs_cp); const rhsTy *rhs_tp = reinterpret_cast(rhs_cp); @@ -6228,12 +6217,12 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, using dpctl::tensor::offset_utils::Strided1DIndexer; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k * m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); size_t 
n_blocks = (n + delta_n - 1) / delta_n; size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); @@ -6337,12 +6326,12 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, Strided1DIndexer, Strided1DIndexer, Strided1DIndexer>; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k * m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); size_t n_blocks = (n + delta_n - 1) / delta_n; size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); @@ -6443,12 +6432,12 @@ gemm_batch_contig_tree_k_impl(sycl::queue &exec_q, Strided1DIndexer, Strided1DIndexer, Strided1DIndexer>; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k * m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); size_t n_blocks = (n + delta_n - 1) / delta_n; size_t k_blocks = (k + n_wi * delta_k - 1) / (n_wi * delta_k); @@ -6573,12 +6562,12 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, Strided1DIndexer>; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k 
* m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); size_t lws = wg_delta_n * wg_delta_m; @@ -6691,12 +6680,12 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, Strided1DIndexer, Strided1DIndexer, Strided1DIndexer>; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k * m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); size_t lws = wg_delta_n * wg_delta_m; @@ -6807,12 +6796,12 @@ gemm_batch_contig_tree_nm_impl(sycl::queue &exec_q, Strided1DIndexer, Strided1DIndexer, Strided1DIndexer>; BatchDimsIndexerT batch_indexer( - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * k)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(k * m)}, - Strided1DIndexer{0, static_cast(batch_nelems), - static_cast(n * m)}); + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * k)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(k * m)}, + Strided1DIndexer{0, static_cast(batch_nelems), + static_cast(n * m)}); size_t lws = wg_delta_n * wg_delta_m; @@ -6911,9 +6900,9 @@ gemm_batch_contig_tree_impl(sycl::queue &exec_q, size_t n, size_t k, size_t m, - py::ssize_t lhs_batch_offset, - py::ssize_t rhs_batch_offset, - py::ssize_t res_batch_offset, + ssize_t lhs_batch_offset, + ssize_t rhs_batch_offset, + ssize_t res_batch_offset, std::vector const &depends = {}) { const lhsTy *lhs_tp = diff --git a/dpctl/tensor/libtensor/include/kernels/reductions.hpp b/dpctl/tensor/libtensor/include/kernels/reductions.hpp index a8d9cf1972..50babfdbe0 100644 --- a/dpctl/tensor/libtensor/include/kernels/reductions.hpp +++ 
b/dpctl/tensor/libtensor/include/kernels/reductions.hpp @@ -31,14 +31,13 @@ #include #include -#include "pybind11/pybind11.h" +#include "dpctl_tensor_types.hpp" #include "utils/math_utils.hpp" #include "utils/offset_utils.hpp" #include "utils/sycl_utils.hpp" -#include "utils/type_dispatch.hpp" +#include "utils/type_dispatch_building.hpp" #include "utils/type_utils.hpp" -namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; namespace su_ns = dpctl::tensor::sycl_utils; @@ -98,17 +97,15 @@ struct SequentialReduction { auto const &inp_out_iter_offsets_ = inp_out_iter_indexer_(id[0]); - const py::ssize_t &inp_iter_offset = + const ssize_t &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const py::ssize_t &out_iter_offset = + const ssize_t &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); outT red_val(identity_); for (size_t m = 0; m < reduction_max_gid_; ++m) { - const py::ssize_t inp_reduction_offset = - inp_reduced_dims_indexer_(m); - const py::ssize_t inp_offset = - inp_iter_offset + inp_reduction_offset; + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; using dpctl::tensor::type_utils::convert_impl; outT val = convert_impl(inp_[inp_offset]); @@ -334,12 +331,12 @@ typedef sycl::event (*reduction_strided_impl_fn_ptr)( const char *, char *, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, int, - const py::ssize_t *, - py::ssize_t, + const ssize_t *, + ssize_t, const std::vector &); template @@ -396,12 +393,12 @@ sycl::event reduction_over_group_with_atomics_strided_impl( const char *arg_cp, char *res_cp, int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t 
reduction_arg_offset, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp); @@ -445,8 +442,8 @@ sycl::event reduction_over_group_with_atomics_strided_impl( using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; - const py::ssize_t *const &res_shape = iter_shape_and_strides; - const py::ssize_t *const &res_strides = + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = iter_shape_and_strides + 2 * iter_nd; IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); @@ -536,9 +533,9 @@ typedef sycl::event (*reduction_contig_impl_fn_ptr)( size_t, const char *, char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); /* @brief Reduce rows in a matrix */ @@ -551,9 +548,9 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( // number of columns) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -579,8 +576,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -610,9 +607,8 @@ sycl::event reduction_axis1_over_group_with_atomics_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - RowsIndexerT rows_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_nelems)}; + RowsIndexerT rows_indexer{0, static_cast(iter_nelems), + 
static_cast(reduction_nelems)}; NoOpIndexerT result_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, result_indexer}; @@ -680,9 +676,9 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( // number of rows) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -709,8 +705,8 @@ sycl::event reduction_axis0_over_group_with_atomics_contig_impl( InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; using KernelName = class reduction_seq_contig_krn(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; constexpr size_t preferred_reductions_per_wi = 8; size_t reductions_per_wi = @@ -989,12 +985,12 @@ typedef sycl::event (*reduction_strided_impl_fn_ptr)( const char *, char *, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, int, - const py::ssize_t *, - py::ssize_t, + const ssize_t *, + ssize_t, const std::vector &); template @@ -1109,12 +1105,12 @@ sycl::event reduction_over_group_temps_strided_impl( const char *arg_cp, char *res_cp, int iter_nd, - const py::ssize_t *iter_shape_and_strides, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, + const ssize_t *iter_shape_and_strides, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, int red_nd, - const py::ssize_t *reduction_shape_stride, - py::ssize_t reduction_arg_offset, + const ssize_t *reduction_shape_stride, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = 
reinterpret_cast(arg_cp); @@ -1127,8 +1123,8 @@ sycl::event reduction_over_group_temps_strided_impl( using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; - const py::ssize_t *const &res_shape = iter_shape_and_strides; - const py::ssize_t *const &res_strides = + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = iter_shape_and_strides + 2 * iter_nd; IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); @@ -1369,8 +1365,8 @@ sycl::event reduction_over_group_temps_strided_impl( dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1433,8 +1429,8 @@ sycl::event reduction_over_group_temps_strided_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, /* shape */ iter_shape_and_strides, /* strides */ iter_shape_and_strides + @@ -1517,9 +1513,9 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( // number of columns) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -1552,8 +1548,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + 
static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -1592,8 +1588,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -1684,9 +1680,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( RowsIndexerT, NoOpIndexerT>; using ReductionIndexerT = NoOpIndexerT; - RowsIndexerT rows_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_nelems)}; + RowsIndexerT rows_indexer{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}; NoOpIndexerT noop_tmp_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{rows_indexer, noop_tmp_indexer}; @@ -1758,8 +1753,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1821,8 +1816,8 @@ sycl::event reduction_axis1_over_group_temps_contig_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -1902,9 +1897,9 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( // number of columns) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + 
ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -1938,8 +1933,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; using KernelName = class reduction_seq_contig_krn(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; if (iter_nelems == 1) { // increase GPU occupancy @@ -2079,8 +2074,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, noop_tmp_indexer}; ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; @@ -2148,8 +2143,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -2211,8 +2206,8 @@ sycl::event reduction_axis0_over_group_temps_contig_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -3522,18 +3517,16 @@ struct SequentialSearchReduction { auto const &inp_out_iter_offsets_ 
= inp_out_iter_indexer_(id[0]); - const py::ssize_t &inp_iter_offset = + const ssize_t &inp_iter_offset = inp_out_iter_offsets_.get_first_offset(); - const py::ssize_t &out_iter_offset = + const ssize_t &out_iter_offset = inp_out_iter_offsets_.get_second_offset(); argT red_val(identity_); outT idx_val(idx_identity_); for (size_t m = 0; m < reduction_max_gid_; ++m) { - const py::ssize_t inp_reduction_offset = - inp_reduced_dims_indexer_(m); - const py::ssize_t inp_offset = - inp_iter_offset + inp_reduction_offset; + const ssize_t inp_reduction_offset = inp_reduced_dims_indexer_(m); + const ssize_t inp_offset = inp_iter_offset + inp_reduction_offset; argT val = inp_[inp_offset]; if (val == red_val) { @@ -3997,12 +3990,12 @@ typedef sycl::event (*search_strided_impl_fn_ptr)( const char *, char *, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, int, - const py::ssize_t *, - py::ssize_t, + const ssize_t *, + ssize_t, const std::vector &); template &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp); @@ -4244,8 +4237,8 @@ sycl::event search_over_group_temps_strided_impl( using IndexerT = dpctl::tensor::offset_utils::UnpackedStridedIndexer; - const py::ssize_t *const &res_shape = iter_shape_and_strides; - const py::ssize_t *const &res_strides = + const ssize_t *const &res_shape = iter_shape_and_strides; + const ssize_t *const &res_strides = iter_shape_and_strides + 2 * iter_nd; IndexerT res_indexer(iter_nd, iter_res_offset, res_shape, res_strides); @@ -4506,8 +4499,8 @@ sycl::event search_over_group_temps_strided_impl( dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -4577,8 +4570,8 @@ sycl::event search_over_group_temps_strided_impl( using ReductionIndexerT = 
dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{iter_nd, iter_res_offset, /* shape */ iter_shape_and_strides, /* strides */ iter_shape_and_strides + @@ -4664,9 +4657,9 @@ typedef sycl::event (*search_contig_impl_fn_ptr)( size_t, const char *, char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -4717,8 +4710,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -4759,8 +4752,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -4865,8 +4858,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = NoOpIndexerT; InputOutputIterIndexerT in_out_iter_indexer{ - InputIterIndexerT{0, static_cast(iter_nelems), - static_cast(reduction_nelems)}, + InputIterIndexerT{0, static_cast(iter_nelems), + static_cast(reduction_nelems)}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{}; @@ -4943,8 +4936,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + 0, 
static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -5013,8 +5006,8 @@ sycl::event search_axis1_over_group_temps_contig_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -5103,9 +5096,9 @@ sycl::event search_axis0_over_group_temps_contig_impl( // number of columns) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t reduction_arg_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t reduction_arg_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -5140,8 +5133,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( InputOutputIterIndexerT in_out_iter_indexer{NoOpIndexerT{}, NoOpIndexerT{}}; ReductionIndexerT reduction_indexer{ - 0, static_cast(reduction_nelems), - static_cast(iter_nelems)}; + 0, static_cast(reduction_nelems), + static_cast(iter_nelems)}; using KernelName = class search_seq_contig_krn(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; if (iter_nelems == 1) { // increase GPU occupancy @@ -5295,8 +5288,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( InputOutputIterIndexerT in_out_iter_indexer{columns_indexer, result_indexer}; ReductionIndexerT reduction_indexer{ - 0, /* size */ static_cast(reduction_nelems), - /* step */ static_cast(iter_nelems)}; + 0, /* size */ static_cast(reduction_nelems), + /* step */ static_cast(iter_nelems)}; auto globalRange = sycl::range<1>{iter_nelems * reduction_groups * wg}; @@ -5371,8 +5364,8 @@ sycl::event 
search_axis0_over_group_temps_contig_impl( dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(reduction_groups_)}; + 0, static_cast(iter_nelems), + static_cast(reduction_groups_)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, @@ -5441,8 +5434,8 @@ sycl::event search_axis0_over_group_temps_contig_impl( using ReductionIndexerT = dpctl::tensor::offset_utils::NoOpIndexer; InputIndexerT inp_indexer{ - 0, static_cast(iter_nelems), - static_cast(remaining_reduction_nelems)}; + 0, static_cast(iter_nelems), + static_cast(remaining_reduction_nelems)}; ResIndexerT res_iter_indexer{}; InputOutputIterIndexerT in_out_iter_indexer{inp_indexer, diff --git a/dpctl/tensor/libtensor/include/kernels/repeat.hpp b/dpctl/tensor/libtensor/include/kernels/repeat.hpp index 05b57a8cda..66601329ae 100644 --- a/dpctl/tensor/libtensor/include/kernels/repeat.hpp +++ b/dpctl/tensor/libtensor/include/kernels/repeat.hpp @@ -26,10 +26,10 @@ #include #include #include -#include #include #include +#include "dpctl_tensor_types.hpp" #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" @@ -42,7 +42,6 @@ namespace kernels namespace repeat { -namespace py = pybind11; using namespace dpctl::tensor::offset_utils; template &); template @@ -138,15 +137,15 @@ repeat_by_sequence_impl(sycl::queue &q, const char *reps_cp, const char *cumsum_cp, int orthog_nd, - const py::ssize_t *orthog_src_dst_shape_and_strides, - py::ssize_t src_offset, - py::ssize_t dst_offset, - py::ssize_t src_axis_shape, - py::ssize_t src_axis_stride, - py::ssize_t dst_axis_shape, - py::ssize_t dst_axis_stride, - py::ssize_t reps_shape, - py::ssize_t reps_stride, + const ssize_t *orthog_src_dst_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, + ssize_t reps_shape, + ssize_t reps_stride, const std::vector 
&depends) { sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { @@ -200,11 +199,11 @@ typedef sycl::event (*repeat_by_sequence_1d_fn_ptr_t)( const char *, const char *, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template @@ -215,11 +214,11 @@ sycl::event repeat_by_sequence_1d_impl(sycl::queue &q, const char *reps_cp, const char *cumsum_cp, int src_nd, - const py::ssize_t *src_shape_strides, - py::ssize_t dst_shape, - py::ssize_t dst_stride, - py::ssize_t reps_shape, - py::ssize_t reps_stride, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, + ssize_t reps_shape, + ssize_t reps_stride, const std::vector &depends) { sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { @@ -277,7 +276,7 @@ class RepeatScalarFunctor private: const T *src = nullptr; T *dst = nullptr; - const py::ssize_t reps = 1; + const ssize_t reps = 1; size_t dst_axis_nelems = 0; OrthogIndexer orthog_strider; SrcAxisIndexer src_axis_strider; @@ -286,7 +285,7 @@ class RepeatScalarFunctor public: RepeatScalarFunctor(const T *src_, T *dst_, - const py::ssize_t reps_, + const ssize_t reps_, size_t dst_axis_nelems_, OrthogIndexer orthog_strider_, SrcAxisIndexer src_axis_strider_, @@ -319,15 +318,15 @@ typedef sycl::event (*repeat_by_scalar_fn_ptr_t)( size_t, const char *, char *, - const py::ssize_t, + const ssize_t, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template @@ -336,15 +335,15 @@ sycl::event repeat_by_scalar_impl(sycl::queue &q, size_t dst_axis_nelems, const char *src_cp, char *dst_cp, - const py::ssize_t reps, + const ssize_t reps, int orthog_nd, - const py::ssize_t *orthog_shape_and_strides, - py::ssize_t src_offset, - py::ssize_t dst_offset, - 
py::ssize_t src_axis_shape, - py::ssize_t src_axis_stride, - py::ssize_t dst_axis_shape, - py::ssize_t dst_axis_stride, + const ssize_t *orthog_shape_and_strides, + ssize_t src_offset, + ssize_t dst_offset, + ssize_t src_axis_shape, + ssize_t src_axis_stride, + ssize_t dst_axis_shape, + ssize_t dst_axis_stride, const std::vector &depends) { sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { @@ -388,11 +387,11 @@ typedef sycl::event (*repeat_by_scalar_1d_fn_ptr_t)( size_t, const char *, char *, - const py::ssize_t, + const ssize_t, int, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, const std::vector &); template @@ -400,11 +399,11 @@ sycl::event repeat_by_scalar_1d_impl(sycl::queue &q, size_t dst_nelems, const char *src_cp, char *dst_cp, - const py::ssize_t reps, + const ssize_t reps, int src_nd, - const py::ssize_t *src_shape_strides, - py::ssize_t dst_shape, - py::ssize_t dst_stride, + const ssize_t *src_shape_strides, + ssize_t dst_shape, + ssize_t dst_stride, const std::vector &depends) { sycl::event repeat_ev = q.submit([&](sycl::handler &cgh) { diff --git a/dpctl/tensor/libtensor/include/kernels/sorting.hpp b/dpctl/tensor/libtensor/include/kernels/sorting.hpp index e577a1a52a..2e8f4f8e91 100644 --- a/dpctl/tensor/libtensor/include/kernels/sorting.hpp +++ b/dpctl/tensor/libtensor/include/kernels/sorting.hpp @@ -24,8 +24,6 @@ #pragma once -#include "pybind11/pybind11.h" - #include #include #include @@ -33,6 +31,8 @@ #include #include +#include "dpctl_tensor_types.hpp" + namespace dpctl { namespace tensor @@ -750,10 +750,10 @@ typedef sycl::event (*sort_contig_fn_ptr_t)(sycl::queue &, size_t, const char *, char *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, + ssize_t, + ssize_t, + ssize_t, + ssize_t, const std::vector &); template > @@ -765,10 +765,10 @@ sycl::event stable_sort_axis1_contig_impl( // number of columns) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - 
py::ssize_t iter_res_offset, - py::ssize_t sort_arg_offset, - py::ssize_t sort_res_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + @@ -837,10 +837,10 @@ sycl::event stable_argsort_axis1_contig_impl( // number of columns) const char *arg_cp, char *res_cp, - py::ssize_t iter_arg_offset, - py::ssize_t iter_res_offset, - py::ssize_t sort_arg_offset, - py::ssize_t sort_res_offset, + ssize_t iter_arg_offset, + ssize_t iter_res_offset, + ssize_t sort_arg_offset, + ssize_t sort_res_offset, const std::vector &depends) { const argTy *arg_tp = reinterpret_cast(arg_cp) + diff --git a/dpctl/tensor/libtensor/include/kernels/where.hpp b/dpctl/tensor/libtensor/include/kernels/where.hpp index a1c0c7cfb0..415dd8a8d5 100644 --- a/dpctl/tensor/libtensor/include/kernels/where.hpp +++ b/dpctl/tensor/libtensor/include/kernels/where.hpp @@ -23,15 +23,13 @@ //===----------------------------------------------------------------------===// #pragma once -#include "pybind11/numpy.h" -#include "pybind11/stl.h" #include #include #include -#include #include #include +#include "dpctl_tensor_types.hpp" #include "kernels/alignment.hpp" #include "utils/offset_utils.hpp" #include "utils/type_utils.hpp" @@ -45,8 +43,6 @@ namespace kernels namespace search { -namespace py = pybind11; - using namespace dpctl::tensor::offset_utils; using dpctl::tensor::kernels::alignment_utils:: @@ -244,7 +240,7 @@ class WhereStridedFunctor void operator()(sycl::id<1> id) const { size_t gid = id[0]; - auto offsets = indexer(static_cast(gid)); + auto offsets = indexer(static_cast(gid)); using dpctl::tensor::type_utils::convert_impl; bool check = @@ -264,11 +260,11 @@ typedef sycl::event (*where_strided_impl_fn_ptr_t)( const char *, const char *, char *, - const py::ssize_t *, - py::ssize_t, - py::ssize_t, - py::ssize_t, - py::ssize_t, + const ssize_t *, + ssize_t, + ssize_t, + 
ssize_t, + ssize_t, const std::vector &); template @@ -279,11 +275,11 @@ sycl::event where_strided_impl(sycl::queue &q, const char *x1_cp, const char *x2_cp, char *dst_cp, - const py::ssize_t *shape_strides, - py::ssize_t x1_offset, - py::ssize_t x2_offset, - py::ssize_t cond_offset, - py::ssize_t dst_offset, + const ssize_t *shape_strides, + ssize_t x1_offset, + ssize_t x2_offset, + ssize_t cond_offset, + ssize_t dst_offset, const std::vector &depends) { const condT *cond_tp = reinterpret_cast(cond_cp); diff --git a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp index 440d0d9d0b..c94b89e9a3 100644 --- a/dpctl/tensor/libtensor/include/utils/offset_utils.hpp +++ b/dpctl/tensor/libtensor/include/utils/offset_utils.hpp @@ -27,15 +27,13 @@ #pragma once #include -#include #include #include #include +#include "kernels/dpctl_tensor_types.hpp" #include "utils/strided_iters.hpp" -namespace py = pybind11; - namespace dpctl { namespace tensor @@ -85,7 +83,7 @@ std::vector concat(std::vector lhs, Vs &&...vs) template std::tuple -device_allocate_and_pack(sycl::queue q, +device_allocate_and_pack(sycl::queue &q, std::vector &host_task_events, Vs &&...vs) { @@ -137,35 +135,35 @@ struct NoOpIndexer struct StridedIndexer { StridedIndexer(int _nd, - py::ssize_t _offset, - py::ssize_t const *_packed_shape_strides) + ssize_t _offset, + ssize_t const *_packed_shape_strides) : nd(_nd), starting_offset(_offset), shape_strides(_packed_shape_strides) { } - py::ssize_t operator()(py::ssize_t gid) const + ssize_t operator()(ssize_t gid) const { return compute_offset(gid); } - py::ssize_t operator()(size_t gid) const + ssize_t operator()(size_t gid) const { - return compute_offset(static_cast(gid)); + return compute_offset(static_cast(gid)); } private: int nd; - py::ssize_t starting_offset; - py::ssize_t const *shape_strides; + ssize_t starting_offset; + ssize_t const *shape_strides; - py::ssize_t compute_offset(py::ssize_t gid) 
const + ssize_t compute_offset(ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; CIndexer_vector _ind(nd); - py::ssize_t relative_offset(0); - _ind.get_displacement( + ssize_t relative_offset(0); + _ind.get_displacement( gid, shape_strides, // shape ptr shape_strides + nd, // strides ptr @@ -178,36 +176,36 @@ struct StridedIndexer struct UnpackedStridedIndexer { UnpackedStridedIndexer(int _nd, - py::ssize_t _offset, - py::ssize_t const *_shape, - py::ssize_t const *_strides) + ssize_t _offset, + ssize_t const *_shape, + ssize_t const *_strides) : nd(_nd), starting_offset(_offset), shape(_shape), strides(_strides) { } - py::ssize_t operator()(py::ssize_t gid) const + ssize_t operator()(ssize_t gid) const { return compute_offset(gid); } - py::ssize_t operator()(size_t gid) const + ssize_t operator()(size_t gid) const { - return compute_offset(static_cast(gid)); + return compute_offset(static_cast(gid)); } private: int nd; - py::ssize_t starting_offset; - py::ssize_t const *shape; - py::ssize_t const *strides; + ssize_t starting_offset; + ssize_t const *shape; + ssize_t const *strides; - py::ssize_t compute_offset(py::ssize_t gid) const + ssize_t compute_offset(ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; CIndexer_vector _ind(nd); py::ssize_t relative_offset(0); - _ind.get_displacement( + _ind.get_displacement( gid, shape, // shape ptr strides, // strides ptr @@ -218,41 +216,39 @@ struct UnpackedStridedIndexer struct Strided1DIndexer { - Strided1DIndexer(py::ssize_t _offset, py::ssize_t _size, py::ssize_t _step) + Strided1DIndexer(ssize_t _offset, ssize_t _size, ssize_t _step) : offset(_offset), size(static_cast(_size)), step(_step) { } - py::ssize_t operator()(size_t gid) const + ssize_t operator()(size_t gid) const { // ensure 0 <= gid < size return offset + std::min(gid, size - 1) * step; } private: - py::ssize_t offset = 0; + ssize_t offset = 0; size_t size = 1; - py::ssize_t step = 1; + ssize_t step = 1; }; struct 
Strided1DCyclicIndexer { - Strided1DCyclicIndexer(py::ssize_t _offset, - py::ssize_t _size, - py::ssize_t _step) + Strided1DCyclicIndexer(ssize_t _offset, ssize_t _size, ssize_t _step) : offset(_offset), size(static_cast(_size)), step(_step) { } - py::ssize_t operator()(size_t gid) const + ssize_t operator()(size_t gid) const { return offset + (gid % size) * step; } private: - py::ssize_t offset = 0; + ssize_t offset = 0; size_t size = 1; - py::ssize_t step = 1; + ssize_t step = 1; }; template struct TwoOffsets @@ -281,45 +277,45 @@ template struct TwoOffsets struct TwoOffsets_StridedIndexer { TwoOffsets_StridedIndexer(int common_nd, - py::ssize_t first_offset_, - py::ssize_t second_offset_, - py::ssize_t const *_packed_shape_strides) + ssize_t first_offset_, + ssize_t second_offset_, + ssize_t const *_packed_shape_strides) : nd(common_nd), starting_first_offset(first_offset_), starting_second_offset(second_offset_), shape_strides(_packed_shape_strides) { } - TwoOffsets operator()(py::ssize_t gid) const + TwoOffsets operator()(ssize_t gid) const { return compute_offsets(gid); } - TwoOffsets operator()(size_t gid) const + TwoOffsets operator()(size_t gid) const { - return compute_offsets(static_cast(gid)); + return compute_offsets(static_cast(gid)); } private: int nd; - py::ssize_t starting_first_offset; - py::ssize_t starting_second_offset; - py::ssize_t const *shape_strides; + ssize_t starting_first_offset; + ssize_t starting_second_offset; + ssize_t const *shape_strides; - TwoOffsets compute_offsets(py::ssize_t gid) const + TwoOffsets compute_offsets(ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; CIndexer_vector _ind(nd); - py::ssize_t relative_first_offset(0); - py::ssize_t relative_second_offset(0); - _ind.get_displacement( + ssize_t relative_first_offset(0); + ssize_t relative_second_offset(0); + _ind.get_displacement( gid, shape_strides, // shape ptr shape_strides + nd, // strides ptr shape_strides + 2 * nd, // strides ptr 
relative_first_offset, relative_second_offset); - return TwoOffsets( + return TwoOffsets( starting_first_offset + relative_first_offset, starting_second_offset + relative_second_offset); } @@ -329,9 +325,9 @@ struct TwoZeroOffsets_Indexer { TwoZeroOffsets_Indexer() {} - TwoOffsets operator()(py::ssize_t) const + TwoOffsets operator()(ssize_t) const { - return TwoOffsets(); + return TwoOffsets(); } }; @@ -389,10 +385,10 @@ template struct ThreeOffsets struct ThreeOffsets_StridedIndexer { ThreeOffsets_StridedIndexer(int common_nd, - py::ssize_t first_offset_, - py::ssize_t second_offset_, - py::ssize_t third_offset_, - py::ssize_t const *_packed_shape_strides) + ssize_t first_offset_, + ssize_t second_offset_, + ssize_t third_offset_, + ssize_t const *_packed_shape_strides) : nd(common_nd), starting_first_offset(first_offset_), starting_second_offset(second_offset_), starting_third_offset(third_offset_), @@ -400,32 +396,32 @@ struct ThreeOffsets_StridedIndexer { } - ThreeOffsets operator()(py::ssize_t gid) const + ThreeOffsets operator()(ssize_t gid) const { return compute_offsets(gid); } - ThreeOffsets operator()(size_t gid) const + ThreeOffsets operator()(size_t gid) const { - return compute_offsets(static_cast(gid)); + return compute_offsets(static_cast(gid)); } private: int nd; - py::ssize_t starting_first_offset; - py::ssize_t starting_second_offset; - py::ssize_t starting_third_offset; - py::ssize_t const *shape_strides; + ssize_t starting_first_offset; + ssize_t starting_second_offset; + ssize_t starting_third_offset; + ssize_t const *shape_strides; - ThreeOffsets compute_offsets(py::ssize_t gid) const + ThreeOffsets compute_offsets(ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; CIndexer_vector _ind(nd); - py::ssize_t relative_first_offset(0); - py::ssize_t relative_second_offset(0); - py::ssize_t relative_third_offset(0); - _ind.get_displacement( + ssize_t relative_first_offset(0); + ssize_t relative_second_offset(0); + ssize_t 
relative_third_offset(0); + _ind.get_displacement( gid, shape_strides, // shape ptr shape_strides + nd, // strides ptr @@ -433,7 +429,7 @@ struct ThreeOffsets_StridedIndexer shape_strides + 3 * nd, // strides ptr relative_first_offset, relative_second_offset, relative_third_offset); - return ThreeOffsets( + return ThreeOffsets( starting_first_offset + relative_first_offset, starting_second_offset + relative_second_offset, starting_third_offset + relative_third_offset); @@ -469,10 +465,10 @@ struct ThreeOffsets_CombinedIndexer { } - ThreeOffsets operator()(py::ssize_t gid) const + ThreeOffsets operator()(ssize_t gid) const { - return ThreeOffsets( - first_indexer_(gid), second_indexer_(gid), third_indexer_(gid)); + return ThreeOffsets(first_indexer_(gid), second_indexer_(gid), + third_indexer_(gid)); } }; @@ -518,11 +514,11 @@ template struct FourOffsets struct FourOffsets_StridedIndexer { FourOffsets_StridedIndexer(int common_nd, - py::ssize_t first_offset_, - py::ssize_t second_offset_, - py::ssize_t third_offset_, - py::ssize_t fourth_offset_, - py::ssize_t const *_packed_shape_strides) + ssize_t first_offset_, + ssize_t second_offset_, + ssize_t third_offset_, + ssize_t fourth_offset_, + ssize_t const *_packed_shape_strides) : nd(common_nd), starting_first_offset(first_offset_), starting_second_offset(second_offset_), starting_third_offset(third_offset_), @@ -531,34 +527,34 @@ struct FourOffsets_StridedIndexer { } - FourOffsets operator()(py::ssize_t gid) const + FourOffsets operator()(ssize_t gid) const { return compute_offsets(gid); } - FourOffsets operator()(size_t gid) const + FourOffsets operator()(size_t gid) const { - return compute_offsets(static_cast(gid)); + return compute_offsets(static_cast(gid)); } private: int nd; - py::ssize_t starting_first_offset; - py::ssize_t starting_second_offset; - py::ssize_t starting_third_offset; - py::ssize_t starting_fourth_offset; - py::ssize_t const *shape_strides; + ssize_t starting_first_offset; + ssize_t 
starting_second_offset; + ssize_t starting_third_offset; + ssize_t starting_fourth_offset; + ssize_t const *shape_strides; - FourOffsets compute_offsets(py::ssize_t gid) const + FourOffsets compute_offsets(ssize_t gid) const { using dpctl::tensor::strides::CIndexer_vector; CIndexer_vector _ind(nd); - py::ssize_t relative_first_offset(0); - py::ssize_t relative_second_offset(0); - py::ssize_t relative_third_offset(0); - py::ssize_t relative_fourth_offset(0); - _ind.get_displacement( + ssize_t relative_first_offset(0); + ssize_t relative_second_offset(0); + ssize_t relative_third_offset(0); + ssize_t relative_fourth_offset(0); + _ind.get_displacement( gid, shape_strides, // shape ptr shape_strides + nd, // strides ptr @@ -567,7 +563,7 @@ struct FourOffsets_StridedIndexer shape_strides + 4 * nd, // strides ptr relative_first_offset, relative_second_offset, relative_third_offset, relative_fourth_offset); - return FourOffsets( + return FourOffsets( starting_first_offset + relative_first_offset, starting_second_offset + relative_second_offset, starting_third_offset + relative_third_offset, @@ -579,26 +575,26 @@ struct FourZeroOffsets_Indexer { FourZeroOffsets_Indexer() {} - FourOffsets operator()(py::ssize_t) const + FourOffsets operator()(ssize_t) const { - return FourOffsets(); + return FourOffsets(); } }; struct NthStrideOffset { NthStrideOffset(int common_nd, - py::ssize_t const *_offsets, - py::ssize_t const *_packed_shape_strides) + ssize_t const *_offsets, + ssize_t const *_packed_shape_strides) : _ind(common_nd), nd(common_nd), offsets(_offsets), shape_strides(_packed_shape_strides) { } - size_t operator()(py::ssize_t gid, int n) const + size_t operator()(ssize_t gid, int n) const { - py::ssize_t relative_offset(0); - _ind.get_displacement( + ssize_t relative_offset(0); + _ind.get_displacement( gid, shape_strides, shape_strides + ((n + 1) * nd), relative_offset); @@ -606,29 +602,29 @@ struct NthStrideOffset } private: - dpctl::tensor::strides::CIndexer_vector 
_ind; + dpctl::tensor::strides::CIndexer_vector _ind; int nd; - py::ssize_t const *offsets; - py::ssize_t const *shape_strides; + ssize_t const *offsets; + ssize_t const *shape_strides; }; template struct FixedDimStridedIndexer { - FixedDimStridedIndexer(const std::array _shape, - const std::array _strides, - py::ssize_t _offset) + FixedDimStridedIndexer(const std::array _shape, + const std::array _strides, + ssize_t _offset) : _ind(_shape), strides(_strides), starting_offset(_offset) { } size_t operator()(size_t gid) const { - dpctl::tensor::strides::CIndexer_array local_indexer( + dpctl::tensor::strides::CIndexer_array local_indexer( std::move(_ind)); local_indexer.set(gid); auto mi = local_indexer.get(); - py::ssize_t relative_offset = 0; + ssize_t relative_offset = 0; #pragma unroll for (int i = 0; i < nd; ++i) { @@ -638,112 +634,110 @@ template struct FixedDimStridedIndexer } private: - dpctl::tensor::strides::CIndexer_array _ind; + dpctl::tensor::strides::CIndexer_array _ind; - const std::array strides; - py::ssize_t starting_offset; + const std::array strides; + ssize_t starting_offset; }; template struct TwoOffsets_FixedDimStridedIndexer { - TwoOffsets_FixedDimStridedIndexer( - const std::array _shape, - const std::array _strides1, - const std::array _strides2, - py::ssize_t _offset1, - py::ssize_t _offset2) + TwoOffsets_FixedDimStridedIndexer(const std::array _shape, + const std::array _strides1, + const std::array _strides2, + ssize_t _offset1, + ssize_t _offset2) : _ind(_shape), strides1(_strides1), strides2(_strides2), starting_offset1(_offset1), starting_offset2(_offset2) { } - TwoOffsets operator()(size_t gid) const + TwoOffsets operator()(size_t gid) const { - dpctl::tensor::strides::CIndexer_array local_indexer( + dpctl::tensor::strides::CIndexer_array local_indexer( std::move(_ind)); local_indexer.set(gid); auto mi = local_indexer.get(); - py::ssize_t relative_offset1 = 0; + ssize_t relative_offset1 = 0; #pragma unroll for (int i = 0; i < nd; ++i) 
{ relative_offset1 += mi[i] * strides1[i]; } - py::ssize_t relative_offset2 = 0; + ssize_t relative_offset2 = 0; #pragma unroll for (int i = 0; i < nd; ++i) { relative_offset2 += mi[i] * strides2[i]; } - return TwoOffsets(starting_offset1 + relative_offset1, - starting_offset2 + relative_offset2); + return TwoOffsets(starting_offset1 + relative_offset1, + starting_offset2 + relative_offset2); } private: - dpctl::tensor::strides::CIndexer_array _ind; + dpctl::tensor::strides::CIndexer_array _ind; - const std::array strides1; - const std::array strides2; - py::ssize_t starting_offset1; - py::ssize_t starting_offset2; + const std::array strides1; + const std::array strides2; + ssize_t starting_offset1; + ssize_t starting_offset2; }; template struct ThreeOffsets_FixedDimStridedIndexer { - ThreeOffsets_FixedDimStridedIndexer( - const std::array _shape, - const std::array _strides1, - const std::array _strides2, - const std::array _strides3, - py::ssize_t _offset1, - py::ssize_t _offset2, - py::ssize_t _offset3) + ThreeOffsets_FixedDimStridedIndexer(const std::array _shape, + const std::array _strides1, + const std::array _strides2, + const std::array _strides3, + ssize_t _offset1, + ssize_t _offset2, + ssize_t _offset3) : _ind(_shape), strides1(_strides1), strides2(_strides2), strides3(_strides3), starting_offset1(_offset1), starting_offset2(_offset2), starting_offset3(_offset3) { } - ThreeOffsets operator()(size_t gid) const + ThreeOffsets operator()(size_t gid) const { - dpctl::tensor::strides::CIndexer_array local_indexer( + dpctl::tensor::strides::CIndexer_array local_indexer( std::move(_ind)); local_indexer.set(gid); auto mi = local_indexer.get(); - py::ssize_t relative_offset1 = 0; + ssize_t relative_offset1 = 0; #pragma unroll for (int i = 0; i < nd; ++i) { relative_offset1 += mi[i] * strides1[i]; } - py::ssize_t relative_offset2 = 0; + ssize_t relative_offset2 = 0; #pragma unroll for (int i = 0; i < nd; ++i) { relative_offset2 += mi[i] * strides2[i]; } - 
py::ssize_t relative_offset3 = 0; + ssize_t relative_offset3 = 0; #pragma unroll for (int i = 0; i < nd; ++i) { relative_offset3 += mi[i] * strides3[i]; } - return ThreeOffsets(starting_offset1 + relative_offset1, - starting_offset2 + relative_offset2, - starting_offset3 + relative_offset3); + return ThreeOffsets(starting_offset1 + relative_offset1, + starting_offset2 + relative_offset2, + starting_offset3 + relative_offset3); } private: - dpctl::tensor::strides::CIndexer_array _ind; - - const std::array strides1; - const std::array strides2; - const std::array strides3; - py::ssize_t starting_offset1; - py::ssize_t starting_offset2; - py::ssize_t starting_offset3; + dpctl::tensor::strides::CIndexer_array _ind; + + const std::array strides1; + const std::array strides2; + const std::array strides3; + ssize_t starting_offset1; + ssize_t starting_offset2; + ssize_t starting_offset3; }; } // namespace offset_utils diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp index af031a963b..252192f507 100644 --- a/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch.hpp @@ -25,8 +25,7 @@ #pragma once #include "dpctl4pybind11.hpp" -#include -#include +#include "type_dispatch_building.hpp" namespace dpctl { @@ -36,129 +35,6 @@ namespace tensor namespace type_dispatch { -enum class typenum_t : int -{ - BOOL = 0, - INT8, // 1 - UINT8, - INT16, - UINT16, - INT32, // 5 - UINT32, - INT64, - UINT64, - HALF, - FLOAT, // 10 - DOUBLE, - CFLOAT, - CDOUBLE, // 13 -}; -constexpr int num_types = 14; // number of elements in typenum_t - -template - typename factory, - int _num_types> -class DispatchTableBuilder -{ -private: - template - const std::vector row_per_dst_type() const - { - std::vector per_dstTy = { - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - 
factory{}.get(), - factory{}.get(), - factory{}.get(), - factory{}.get(), - factory>{}.get(), - factory>{}.get()}; - assert(per_dstTy.size() == _num_types); - return per_dstTy; - } - -public: - DispatchTableBuilder() = default; - ~DispatchTableBuilder() = default; - - void populate_dispatch_table(funcPtrT table[][_num_types]) const - { - const auto map_by_dst_type = {row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type(), - row_per_dst_type>(), - row_per_dst_type>()}; - assert(map_by_dst_type.size() == _num_types); - int dst_id = 0; - for (auto &row : map_by_dst_type) { - int src_id = 0; - for (auto &fn_ptr : row) { - table[dst_id][src_id] = fn_ptr; - ++src_id; - } - ++dst_id; - } - } -}; - -template - typename factory, - int _num_types> -class DispatchVectorBuilder -{ -private: - template const funcPtrT func_per_type() const - { - funcPtrT f = factory{}.get(); - return f; - } - -public: - DispatchVectorBuilder() = default; - ~DispatchVectorBuilder() = default; - - void populate_dispatch_vector(funcPtrT vector[]) const - { - const auto fn_map_by_type = {func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type(), - func_per_type>(), - func_per_type>()}; - assert(fn_map_by_type.size() == _num_types); - int ty_id = 0; - for (auto &fn : fn_map_by_type) { - vector[ty_id] = fn; - ++ty_id; - } - } -}; - struct usm_ndarray_types { @@ -250,136 +126,6 @@ struct usm_ndarray_types } }; -/*! @brief struct to define result_type typename for Ty == ArgTy */ -template -struct TypeMapResultEntry : std::bool_constant> -{ - using result_type = ResTy; -}; - -/*! 
@brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 == - * ArgTy2 */ -template -struct BinaryTypeMapResultEntry - : std::bool_constant, - std::is_same>> -{ - using result_type = ResTy; -}; - -/*! @brief fall-through struct with specified result_type, usually void */ -template struct DefaultResultEntry : std::true_type -{ - using result_type = Ty; -}; - -/*! @brief Utility struct to convert C++ type into typeid integer */ -template struct GetTypeid -{ - int get() - { - if constexpr (std::is_same_v) { - return static_cast(typenum_t::BOOL); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::INT8); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::UINT8); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::INT16); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::UINT16); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::INT32); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::UINT32); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::INT64); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::UINT64); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::HALF); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::FLOAT); - } - else if constexpr (std::is_same_v) { - return static_cast(typenum_t::DOUBLE); - } - else if constexpr (std::is_same_v>) { - return static_cast(typenum_t::CFLOAT); - } - else if constexpr (std::is_same_v>) { - return static_cast(typenum_t::CDOUBLE); - } - else if constexpr (std::is_same_v) { // special token - return -1; - } - - assert(("Unsupported type T", false)); - return -2; - } -}; - -/*! 
@brief Class to generate vector of null function pointers */ -template struct NullPtrVector -{ - - using value_type = FunPtrT; - using const_reference = value_type const &; - - NullPtrVector() : val(nullptr) {} - - const_reference operator[](int) const - { - return val; - } - -private: - value_type val; -}; - -/*! @brief Class to generate table of null function pointers */ -template struct NullPtrTable -{ - using value_type = NullPtrVector; - using const_reference = value_type const &; - - NullPtrTable() : val() {} - - const_reference operator[](int) const - { - return val; - } - -private: - value_type val; -}; - -template -struct TypePairDefinedEntry : std::bool_constant && - std::is_same_v> -{ - static constexpr bool is_defined = true; -}; - -struct NotDefinedEntry : std::true_type -{ - static constexpr bool is_defined = false; -}; - } // namespace type_dispatch } // namespace tensor diff --git a/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp new file mode 100644 index 0000000000..11cccdfb56 --- /dev/null +++ b/dpctl/tensor/libtensor/include/utils/type_dispatch_building.hpp @@ -0,0 +1,294 @@ +//===--type_dispatch.cpp - Type-dispatch table building utils ----*-C++-*- ===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2022 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+// +//===----------------------------------------------------------------------===// +/// +/// \file +/// This file defines class to implement dispatch tables for pair of types +//===----------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace dpctl +{ +namespace tensor +{ + +namespace type_dispatch +{ + +enum class typenum_t : int +{ + BOOL = 0, + INT8, // 1 + UINT8, + INT16, + UINT16, + INT32, // 5 + UINT32, + INT64, + UINT64, + HALF, + FLOAT, // 10 + DOUBLE, + CFLOAT, + CDOUBLE, // 13 +}; +constexpr int num_types = 14; // number of elements in typenum_t + +template + typename factory, + int _num_types> +class DispatchTableBuilder +{ +private: + template + const std::vector row_per_dst_type() const + { + std::vector per_dstTy = { + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory{}.get(), + factory>{}.get(), + factory>{}.get()}; + assert(per_dstTy.size() == _num_types); + return per_dstTy; + } + +public: + DispatchTableBuilder() = default; + ~DispatchTableBuilder() = default; + + void populate_dispatch_table(funcPtrT table[][_num_types]) const + { + const auto map_by_dst_type = {row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type(), + row_per_dst_type>(), + row_per_dst_type>()}; + assert(map_by_dst_type.size() == _num_types); + int dst_id = 0; + for (auto &row : map_by_dst_type) { + int src_id = 0; + for (auto &fn_ptr : row) { + table[dst_id][src_id] = fn_ptr; + ++src_id; + } + ++dst_id; + } + } +}; + +template + typename factory, + int _num_types> +class DispatchVectorBuilder +{ +private: + template const funcPtrT 
func_per_type() const + { + funcPtrT f = factory{}.get(); + return f; + } + +public: + DispatchVectorBuilder() = default; + ~DispatchVectorBuilder() = default; + + void populate_dispatch_vector(funcPtrT vector[]) const + { + const auto fn_map_by_type = {func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type(), + func_per_type>(), + func_per_type>()}; + assert(fn_map_by_type.size() == _num_types); + int ty_id = 0; + for (auto &fn : fn_map_by_type) { + vector[ty_id] = fn; + ++ty_id; + } + } +}; + +/*! @brief struct to define result_type typename for Ty == ArgTy */ +template +struct TypeMapResultEntry : std::bool_constant> +{ + using result_type = ResTy; +}; + +/*! @brief struct to define result_type typename for Ty1 == ArgTy1 && Ty2 == + * ArgTy2 */ +template +struct BinaryTypeMapResultEntry + : std::bool_constant, + std::is_same>> +{ + using result_type = ResTy; +}; + +/*! @brief fall-through struct with specified result_type, usually void */ +template struct DefaultResultEntry : std::true_type +{ + using result_type = Ty; +}; + +/*! 
@brief Utility struct to convert C++ type into typeid integer */ +template struct GetTypeid +{ + int get() + { + if constexpr (std::is_same_v) { + return static_cast(typenum_t::BOOL); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT8); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT16); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT32); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::INT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::UINT64); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::HALF); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::FLOAT); + } + else if constexpr (std::is_same_v) { + return static_cast(typenum_t::DOUBLE); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CFLOAT); + } + else if constexpr (std::is_same_v>) { + return static_cast(typenum_t::CDOUBLE); + } + else if constexpr (std::is_same_v) { // special token + return -1; + } + + assert(("Unsupported type T", false)); + return -2; + } +}; + +/*! @brief Class to generate vector of null function pointers */ +template struct NullPtrVector +{ + + using value_type = FunPtrT; + using const_reference = value_type const &; + + NullPtrVector() : val(nullptr) {} + + const_reference operator[](int) const + { + return val; + } + +private: + value_type val; +}; + +/*! 
@brief Class to generate table of null function pointers */ +template struct NullPtrTable +{ + using value_type = NullPtrVector; + using const_reference = value_type const &; + + NullPtrTable() : val() {} + + const_reference operator[](int) const + { + return val; + } + +private: + value_type val; +}; + +template +struct TypePairDefinedEntry : std::bool_constant && + std::is_same_v> +{ + static constexpr bool is_defined = true; +}; + +struct NotDefinedEntry : std::true_type +{ + static constexpr bool is_defined = false; +}; + +} // namespace type_dispatch + +} // namespace tensor +} // namespace dpctl diff --git a/dpctl/tensor/libtensor/source/boolean_reductions.cpp b/dpctl/tensor/libtensor/source/boolean_reductions.cpp index 32deab6da9..b7ca433367 100644 --- a/dpctl/tensor/libtensor/source/boolean_reductions.cpp +++ b/dpctl/tensor/libtensor/source/boolean_reductions.cpp @@ -37,10 +37,13 @@ #include "dpctl4pybind11.hpp" #include "kernels/boolean_reductions.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "utils/type_utils.hpp" namespace py = pybind11; +static_assert(std::is_same_v); + namespace dpctl { namespace tensor diff --git a/dpctl/tensor/libtensor/source/clip.cpp b/dpctl/tensor/libtensor/source/clip.cpp index ac494c19ae..96af65771d 100644 --- a/dpctl/tensor/libtensor/source/clip.cpp +++ b/dpctl/tensor/libtensor/source/clip.cpp @@ -24,12 +24,12 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include #include #include +#include #include #include "clip.hpp" diff --git a/dpctl/tensor/libtensor/source/clip.hpp b/dpctl/tensor/libtensor/source/clip.hpp index d4b8af2cf5..e8ed1e83fb 100644 --- a/dpctl/tensor/libtensor/source/clip.hpp +++ b/dpctl/tensor/libtensor/source/clip.hpp @@ -24,7 +24,7 @@ //===----------------------------------------------------------------------===// #pragma once -#include +#include #include #include diff --git 
a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp index 4b3e8b635b..fd65860690 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/abs.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "abs.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp index 011cc052fb..38ddeba9b4 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acos.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "acos.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp index 526bd44f12..48a1036528 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/acosh.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "acosh.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp index 247b8e0283..e4ca013223 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/add.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include 
#include +#include #include #include "add.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp index 166b37b27b..ab1d9ed866 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/angle.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "angle.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp index 14ef5e2665..1659ff6c30 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asin.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "asin.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp index dd0b4e62f7..f07ecc7c74 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/asinh.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "asinh.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp index 81ff00c46a..a06ee1278a 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan.cpp @@ -24,10 +24,10 @@ 
//===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "atan.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp index d12a4ff540..49ec146ace 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atan2.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "atan2.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp index c42769b8d0..d97d78f79e 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/atanh.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "atanh.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp index f86f5112cd..ec227b8b0d 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_and.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "bitwise_and.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp index 29a04cff38..a9015b213f 100644 --- 
a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_invert.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "bitwise_invert.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp index 7969bc4ffa..e3056e4dbf 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_left_shift.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "bitwise_left_shift.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp index 33a57f907c..81f976f862 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_or.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "bitwise_or.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp index 3847204b1f..a0671256ce 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_right_shift.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" 
-#include #include #include #include +#include #include #include "bitwise_right_shift.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp index 71d606766f..efe1e9bda5 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/bitwise_xor.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "bitwise_xor.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp index b42f234c0d..6841023c45 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cbrt.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "cbrt.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp index f1bb362c5b..eeb4666959 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/ceil.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "ceil.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp index cac84e63fb..a520f2ce1f 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/conj.cpp @@ -24,10 +24,10 @@ 
//===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "conj.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp index 6a887e0345..d02438f1fe 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/copysign.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "copysign.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp index 1986610510..d8d1958f62 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cos.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "cos.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp index 0bb74df979..6525ad54fe 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/cosh.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "cosh.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp index da0137fd5f..673af04b77 100644 --- 
a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions.hpp @@ -34,6 +34,7 @@ #include "elementwise_functions_type_utils.hpp" #include "kernels/alignment.hpp" +#include "kernels/dpctl_tensor_types.hpp" #include "simplify_iteration_space.hpp" #include "utils/memory_overlap.hpp" #include "utils/offset_utils.hpp" @@ -42,6 +43,8 @@ namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; +static_assert(std::is_same_v); + namespace dpctl { namespace tensor diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp index 473048e8fa..44b83497e8 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.cpp @@ -24,9 +24,9 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include +#include #include "elementwise_functions_type_utils.hpp" #include "utils/type_dispatch.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp index 6dac195dc2..7f1cacdc20 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/elementwise_functions_type_utils.hpp @@ -25,9 +25,9 @@ #pragma once #include "dpctl4pybind11.hpp" -#include #include #include +#include #include "utils/type_dispatch.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp index f36ec1b446..a650d5d8fd 100644 --- 
a/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/equal.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp index 51ccaaac70..f0c6ec9a62 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp index 438ad0800e..a59f193644 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/exp2.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp index 3b9332c4f1..26c11a926b 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/expm1.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git 
a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp index 9ccf89f13a..c538cd7668 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp index e75fc56c67..4797198483 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/floor_divide.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp index f79102df47..87589a88f9 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp index 005679c3fb..bb46ceb0ec 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/greater_equal.cpp @@ -24,10 +24,10 @@ 
//===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp index 2442710198..b14f23ea7c 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/hypot.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp index 4012b9206f..270504a199 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/imag.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp index 73a2be4010..6da365d5e0 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/isfinite.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp index 2600fe4f74..1c19a3587d 100644 --- 
a/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/isinf.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp index b75618c5e0..e2b224bd5e 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/isnan.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp index c34122d862..1326b9741f 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/less.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp index 712b30d902..f402ad88ad 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/less_equal.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git 
a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp index f73b9e2414..5258f56158 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp index 566dfcbcf7..d6a2815cd1 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log10.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp index badb474778..961e56e319 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log1p.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp index b5a8a39684..c307246ecc 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/log2.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// 
#include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp index 77ded230be..2dd585ab4e 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logaddexp.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp index 4c573ce508..f2680cbaff 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_and.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp index 84362cd9ce..de9a48320b 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_not.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp index ebf8251b2e..15eb40d0f2 100644 --- 
a/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_or.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp index 9488a5615a..fd1853b927 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/logical_xor.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp index 208bdcf47f..edcee5ded7 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/maximum.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp index dc1a826ac4..ff0ee9ce9c 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/minimum.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include 
"elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp index c087abd9ff..0058dadcfc 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/multiply.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp index bc659506d1..c10dfa0fc1 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/negative.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp index a7a3e909cb..ba9fd3bc78 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/not_equal.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp index eaff0794d2..99cf8b821d 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/positive.cpp @@ 
-24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp index a8ef6cb171..a6cacba4ef 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/pow.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp index 60060084e1..25a062785b 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/proj.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp index 890a308a4e..2c63606fb6 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/real.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp index 5f86188c99..5c717142d8 100644 --- 
a/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/reciprocal.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp index 3255ea7e7f..e2b0e38061 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/remainder.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp index cce730b899..510e1d53ca 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/round.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp index 4661fdfa48..d5df041ee4 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/rsqrt.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git 
a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp index 7b7c2c22e5..352cc0e4e4 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sign.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp index fc101dd64b..b8b917cbdd 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/signbit.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp index 415dc15133..487bcfc0dd 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sin.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp index d9f92eb8f1..49064284ce 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sinh.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// 
#include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp index 159d45b51c..db04b01298 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/sqrt.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp index 184e09c19c..968262a7b0 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/square.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp index 9703182e7a..c720ab23a3 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/subtract.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp index 2f1fbf55f2..c5dc7f4625 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp +++ 
b/dpctl/tensor/libtensor/source/elementwise_functions/tan.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp index 033389e46d..398bac4097 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/tanh.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp index 22ad9bf3cb..9894ffa5a7 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/true_divide.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp index 5b2f451fb0..6f86b3e19c 100644 --- a/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp +++ b/dpctl/tensor/libtensor/source/elementwise_functions/trunc.cpp @@ -24,10 +24,10 @@ //===----------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "elementwise_functions.hpp" diff --git a/dpctl/tensor/libtensor/source/full_ctor.cpp 
b/dpctl/tensor/libtensor/source/full_ctor.cpp index c8004bfae8..94d0a13100 100644 --- a/dpctl/tensor/libtensor/source/full_ctor.cpp +++ b/dpctl/tensor/libtensor/source/full_ctor.cpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "full_ctor.hpp" +#include "unboxing_helper.hpp" namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; @@ -48,7 +49,60 @@ namespace py_internal using dpctl::utils::keep_args_alive; -using dpctl::tensor::kernels::constructors::full_contig_fn_ptr_t; +typedef sycl::event (*full_contig_fn_ptr_t)(sycl::queue &, + size_t, + const py::object &, + char *, + const std::vector &); + +/*! + * @brief Function to submit kernel to fill given contiguous memory allocation + * with specified value. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param py_value Python object representing the value to fill the array with. + * Must be convertible to `dstTy`. + * @param dst_p Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. 
+ * @defgroup CtorKernels + */ +template +sycl::event full_contig_impl(sycl::queue &exec_q, + size_t nelems, + const py::object &py_value, + char *dst_p, + const std::vector &depends) +{ + dstTy fill_v; + + PythonObjectUnboxer unboxer{}; + try { + fill_v = unboxer(py_value); + } catch (const py::error_already_set &e) { + throw; + } + + using dpctl::tensor::kernels::constructors::full_contig_impl; + + sycl::event fill_ev = + full_contig_impl(exec_q, nelems, fill_v, dst_p, depends); + + return fill_ev; +} + +template struct FullContigFactory +{ + fnT get() + { + fnT f = full_contig_impl; + return f; + } +}; static full_contig_fn_ptr_t full_contig_dispatch_vector[td_ns::num_types]; @@ -99,7 +153,6 @@ usm_ndarray_full(const py::object &py_value, void init_full_ctor_dispatch_vectors(void) { using namespace td_ns; - using dpctl::tensor::kernels::constructors::FullContigFactory; DispatchVectorBuilder dvb; diff --git a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp index 926f5ffad6..9a2b51497e 100644 --- a/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp +++ b/dpctl/tensor/libtensor/source/linalg_functions/dot.cpp @@ -1,9 +1,9 @@ #include "dpctl4pybind11.hpp" -#include #include #include #include #include +#include #include #include diff --git a/dpctl/tensor/libtensor/source/linear_sequences.cpp b/dpctl/tensor/libtensor/source/linear_sequences.cpp index 72d292df5f..e3f804b43a 100644 --- a/dpctl/tensor/libtensor/source/linear_sequences.cpp +++ b/dpctl/tensor/libtensor/source/linear_sequences.cpp @@ -35,6 +35,7 @@ #include "utils/type_utils.hpp" #include "linear_sequences.hpp" +#include "unboxing_helper.hpp" namespace py = pybind11; namespace td_ns = dpctl::tensor::type_dispatch; @@ -46,13 +47,121 @@ namespace tensor namespace py_internal { -using dpctl::utils::keep_args_alive; +// Constructor to populate tensor with linear sequence defined by +// start and step data + +typedef sycl::event 
(*lin_space_step_fn_ptr_t)( + sycl::queue &, + size_t, // num_elements + const py::object &start, + const py::object &step, + char *, // dst_data_ptr + const std::vector &); + +/*! + * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting value and increment + * given as Python objects. + * + * @param q Sycl queue to which the kernel is submitted + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array element data type `Ty`. + * @param step Increment of the sequence as Python object. Must be convertible + * to array element data type `Ty`. + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_step_impl(sycl::queue &exec_q, + size_t nelems, + const py::object &start, + const py::object &step, + char *array_data, + const std::vector &depends) +{ + Ty start_v; + Ty step_v; + + const auto &unboxer = PythonObjectUnboxer{}; + try { + start_v = unboxer(start); + step_v = unboxer(step); + } catch (const py::error_already_set &e) { + throw; + } -using dpctl::tensor::kernels::constructors::lin_space_step_fn_ptr_t; + using dpctl::tensor::kernels::constructors::lin_space_step_impl; -static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; + auto lin_space_step_event = lin_space_step_impl( + exec_q, nelems, start_v, step_v, array_data, depends); + + return lin_space_step_event; +} + +typedef sycl::event (*lin_space_affine_fn_ptr_t)( + sycl::queue &, + size_t, // num_elements + const py::object &start, + const py::object &end, + bool include_endpoint, + char *, // dst_data_ptr + const std::vector &); + +/*! 
+ * @brief Function to submit kernel to populate given contiguous memory + * allocation with linear sequence specified by starting and end values given + * as Python objects. + * + * @param exec_q Sycl queue to which kernel is submitted for execution. + * @param nelems Length of the sequence + * @param start Starting value of the sequence as Python object. Must be + * convertible to array data element type `Ty`. + * @param end End-value of the sequence as Python object. Must be convertible + * to array data element type `Ty`. + * @param include_endpoint Whether the end-value is included in the sequence + * @param array_data Kernel accessible USM pointer to the start of array to be + * populated. + * @param depends List of events to wait for before starting computations, if + * any. + * + * @return Event to wait on to ensure that computation completes. + * @defgroup CtorKernels + */ +template +sycl::event lin_space_affine_impl(sycl::queue &exec_q, + size_t nelems, + const py::object &start, + const py::object &end, + bool include_endpoint, + char *array_data, + const std::vector &depends) +{ + Ty start_v, end_v; + const auto &unboxer = PythonObjectUnboxer{}; + try { + start_v = unboxer(start); + end_v = unboxer(end); + } catch (const py::error_already_set &e) { + throw; + } + + using dpctl::tensor::kernels::constructors::lin_space_affine_impl; + + auto lin_space_affine_event = lin_space_affine_impl( + exec_q, nelems, start_v, end_v, include_endpoint, array_data, depends); -using dpctl::tensor::kernels::constructors::lin_space_affine_fn_ptr_t; + return lin_space_affine_event; +} + +using dpctl::utils::keep_args_alive; + +static lin_space_step_fn_ptr_t lin_space_step_dispatch_vector[td_ns::num_types]; static lin_space_affine_fn_ptr_t lin_space_affine_dispatch_vector[td_ns::num_types]; @@ -153,11 +262,36 @@ usm_ndarray_linear_sequence_affine(const py::object &start, linspace_affine_event); } +/*!
+ * @brief Factory to get function pointer of type `fnT` for array with elements + * of type `Ty`. + * @defgroup CtorKernels + */ +template struct LinSpaceStepFactory +{ + fnT get() + { + fnT f = lin_space_step_impl; + return f; + } +}; + +/*! + * @brief Factory to get function pointer of type `fnT` for array data type + * `Ty`. + */ +template struct LinSpaceAffineFactory +{ + fnT get() + { + fnT f = lin_space_affine_impl; + return f; + } +}; + void init_linear_sequences_dispatch_vectors(void) { using namespace td_ns; - using dpctl::tensor::kernels::constructors::LinSpaceAffineFactory; - using dpctl::tensor::kernels::constructors::LinSpaceStepFactory; DispatchVectorBuilder diff --git a/dpctl/tensor/libtensor/source/reductions/argmax.cpp b/dpctl/tensor/libtensor/source/reductions/argmax.cpp index 1d83bf9c2d..d3e2460081 100644 --- a/dpctl/tensor/libtensor/source/reductions/argmax.cpp +++ b/dpctl/tensor/libtensor/source/reductions/argmax.cpp @@ -23,10 +23,10 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/argmin.cpp b/dpctl/tensor/libtensor/source/reductions/argmin.cpp index c6469e6864..57d0a9ccd2 100644 --- a/dpctl/tensor/libtensor/source/reductions/argmin.cpp +++ b/dpctl/tensor/libtensor/source/reductions/argmin.cpp @@ -23,10 +23,10 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp index e3b015a4e0..4936edd17f 100644 --- a/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp +++ b/dpctl/tensor/libtensor/source/reductions/logsumexp.cpp @@ -23,10 +23,10 @@
//===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/max.cpp b/dpctl/tensor/libtensor/source/reductions/max.cpp index 32c60b943b..0166857039 100644 --- a/dpctl/tensor/libtensor/source/reductions/max.cpp +++ b/dpctl/tensor/libtensor/source/reductions/max.cpp @@ -23,10 +23,10 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/min.cpp b/dpctl/tensor/libtensor/source/reductions/min.cpp index de1a81387d..f36cff2bcf 100644 --- a/dpctl/tensor/libtensor/source/reductions/min.cpp +++ b/dpctl/tensor/libtensor/source/reductions/min.cpp @@ -23,10 +23,10 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/prod.cpp b/dpctl/tensor/libtensor/source/reductions/prod.cpp index a90d04304a..66c8bb35be 100644 --- a/dpctl/tensor/libtensor/source/reductions/prod.cpp +++ b/dpctl/tensor/libtensor/source/reductions/prod.cpp @@ -23,10 +23,10 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp index c7313930b4..e7e80cc680 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp +++ b/dpctl/tensor/libtensor/source/reductions/reduce_hypot.cpp @@ -23,10 +23,10 @@ 
//===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp index 2478545efe..a6d7a274fb 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_atomic_support.hpp @@ -23,8 +23,8 @@ //===--------------------------------------------------------------------===// #pragma once -#include #include +#include #include #include "utils/type_utils.hpp" diff --git a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp index 5aafe38a40..e6da120821 100644 --- a/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp +++ b/dpctl/tensor/libtensor/source/reductions/reduction_over_axis.hpp @@ -25,9 +25,9 @@ #pragma once -#include #include #include +#include #include #include #include diff --git a/dpctl/tensor/libtensor/source/reductions/sum.cpp b/dpctl/tensor/libtensor/source/reductions/sum.cpp index 33803cfd7b..81130e9abd 100644 --- a/dpctl/tensor/libtensor/source/reductions/sum.cpp +++ b/dpctl/tensor/libtensor/source/reductions/sum.cpp @@ -23,10 +23,10 @@ //===--------------------------------------------------------------------===// #include "dpctl4pybind11.hpp" -#include #include #include #include +#include #include #include "kernels/reductions.hpp" diff --git a/dpctl/tensor/libtensor/source/tensor_ctors.cpp b/dpctl/tensor/libtensor/source/tensor_ctors.cpp index be2b20c18d..0f1f7a81fc 100644 --- a/dpctl/tensor/libtensor/source/tensor_ctors.cpp +++ b/dpctl/tensor/libtensor/source/tensor_ctors.cpp @@ -46,6 +46,7 @@ #include "eye_ctor.hpp" #include "full_ctor.hpp" #include "integer_advanced_indexing.hpp" +#include 
"kernels/dpctl_tensor_types.hpp" #include "linear_sequences.hpp" #include "repeat.hpp" #include "simplify_iteration_space.hpp" @@ -56,6 +57,8 @@ namespace py = pybind11; +static_assert(std::is_same_v); + namespace { diff --git a/dpctl/tensor/libtensor/source/unboxing_helper.hpp b/dpctl/tensor/libtensor/source/unboxing_helper.hpp new file mode 100644 index 0000000000..d7082c3e13 --- /dev/null +++ b/dpctl/tensor/libtensor/source/unboxing_helper.hpp @@ -0,0 +1,53 @@ +//===-- ------------ Implementation of _tensor_impl module ----*-C++-*-/===// +// +// Data Parallel Control (dpctl) +// +// Copyright 2020-2023 Intel Corporation +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +//===--------------------------------------------------------------------===// +/// +/// \file +/// This file defines functions of dpctl.tensor._tensor_impl extensions +//===--------------------------------------------------------------------===// + +#pragma once + +#include +#include + +namespace dpctl +{ +namespace tensor +{ +namespace py_internal +{ + +template struct PythonObjectUnboxer +{ + T operator()(const py::object &o) const + { + if constexpr (std::is_same_v) { + float tmp = py::cast(o); + return static_cast(tmp); + } + else { + return py::cast(o); + } + } +}; + +} // namespace py_internal +} // namespace tensor +} // namespace dpctl