Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Decouple pybind11 and kernels #1516

Merged
merged 3 commits into from
Jan 31, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 4 additions & 6 deletions dpctl/tensor/libtensor/include/kernels/accumulators.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,13 @@
#include <array>
#include <cstdint>
#include <limits>
#include <pybind11/pybind11.h>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_dispatch.hpp"
#include "utils/type_dispatch_building.hpp"

namespace dpctl
{
Expand All @@ -43,8 +43,6 @@ namespace kernels
namespace accumulators
{

namespace py = pybind11;

using namespace dpctl::tensor::offset_utils;

template <typename T> T ceiling_quotient(T n, T m)
Expand Down Expand Up @@ -437,7 +435,7 @@ typedef size_t (*accumulate_strided_impl_fn_ptr_t)(
size_t,
const char *,
int,
const py::ssize_t *,
const ssize_t *,
char *,
std::vector<sycl::event> &,
const std::vector<sycl::event> &);
Expand All @@ -447,7 +445,7 @@ size_t accumulate_strided_impl(sycl::queue &q,
size_t n_elems,
const char *mask,
int nd,
const py::ssize_t *shape_strides,
const ssize_t *shape_strides,
char *cumsum,
std::vector<sycl::event> &host_tasks,
const std::vector<sycl::event> &depends = {})
Expand Down
138 changes: 68 additions & 70 deletions dpctl/tensor/libtensor/include/kernels/boolean_advanced_indexing.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,13 +25,13 @@
#pragma once
#include <cstdint>
#include <limits>
#include <pybind11/pybind11.h>
#include <sycl/sycl.hpp>
#include <utility>
#include <vector>

#include "dpctl_tensor_types.hpp"
#include "utils/offset_utils.hpp"
#include "utils/type_dispatch.hpp"
#include "utils/type_dispatch_building.hpp"

namespace dpctl
{
Expand All @@ -42,8 +42,6 @@ namespace kernels
namespace indexing
{

namespace py = pybind11;

using namespace dpctl::tensor::offset_utils;

template <typename OrthogIndexerT,
Expand Down Expand Up @@ -90,7 +88,7 @@ struct MaskedExtractStridedFunctor
// + 1 : 1)
if (mask_set) {
auto orthog_offsets =
orthog_src_dst_indexer(static_cast<py::ssize_t>(orthog_i));
orthog_src_dst_indexer(static_cast<ssize_t>(orthog_i));

size_t total_src_offset = masked_src_indexer(masked_i) +
orthog_offsets.get_first_offset();
Expand Down Expand Up @@ -161,7 +159,7 @@ struct MaskedPlaceStridedFunctor
// + 1 : 1)
if (mask_set) {
auto orthog_offsets =
orthog_dst_rhs_indexer(static_cast<py::ssize_t>(orthog_i));
orthog_dst_rhs_indexer(static_cast<ssize_t>(orthog_i));

size_t total_dst_offset = masked_dst_indexer(masked_i) +
orthog_offsets.get_first_offset();
Expand Down Expand Up @@ -199,28 +197,28 @@ class masked_extract_all_slices_strided_impl_krn;

typedef sycl::event (*masked_extract_all_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
py::ssize_t,
ssize_t,
const char *,
const char *,
char *,
int,
py::ssize_t const *,
py::ssize_t,
py::ssize_t,
ssize_t const *,
ssize_t,
ssize_t,
const std::vector<sycl::event> &);

template <typename dataT, typename indT>
sycl::event masked_extract_all_slices_strided_impl(
sycl::queue &exec_q,
py::ssize_t iteration_size,
ssize_t iteration_size,
const char *src_p,
const char *cumsum_p,
char *dst_p,
int nd,
const py::ssize_t
const ssize_t
*packed_src_shape_strides, // [src_shape, src_strides], length 2*nd
py::ssize_t dst_size, // dst is 1D
py::ssize_t dst_stride,
ssize_t dst_size, // dst is 1D
ssize_t dst_stride,
const std::vector<sycl::event> &depends = {})
{
// using MaskedExtractStridedFunctor;
Expand All @@ -230,7 +228,7 @@ sycl::event masked_extract_all_slices_strided_impl(

TwoZeroOffsets_Indexer orthog_src_dst_indexer{};

/* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const
/* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
* *_packed_shape_strides) */
StridedIndexer masked_src_indexer(nd, 0, packed_src_shape_strides);
Strided1DIndexer masked_dst_indexer(0, dst_size, dst_stride);
Expand All @@ -254,19 +252,19 @@ sycl::event masked_extract_all_slices_strided_impl(

typedef sycl::event (*masked_extract_some_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
py::ssize_t,
py::ssize_t,
ssize_t,
ssize_t,
const char *,
const char *,
char *,
int,
py::ssize_t const *,
py::ssize_t,
py::ssize_t,
ssize_t const *,
ssize_t,
ssize_t,
int,
py::ssize_t const *,
py::ssize_t,
py::ssize_t,
ssize_t const *,
ssize_t,
ssize_t,
const std::vector<sycl::event> &);

template <typename OrthoIndexerT,
Expand All @@ -279,24 +277,24 @@ class masked_extract_some_slices_strided_impl_krn;
template <typename dataT, typename indT>
sycl::event masked_extract_some_slices_strided_impl(
sycl::queue &exec_q,
py::ssize_t orthog_nelems,
py::ssize_t masked_nelems,
ssize_t orthog_nelems,
ssize_t masked_nelems,
const char *src_p,
const char *cumsum_p,
char *dst_p,
int orthog_nd,
const py::ssize_t
const ssize_t
*packed_ortho_src_dst_shape_strides, // [ortho_shape, ortho_src_strides,
// ortho_dst_strides], length
// 3*ortho_nd
py::ssize_t ortho_src_offset,
py::ssize_t ortho_dst_offset,
ssize_t ortho_src_offset,
ssize_t ortho_dst_offset,
int masked_nd,
const py::ssize_t *packed_masked_src_shape_strides, // [masked_src_shape,
// masked_src_strides],
// length 2*masked_nd
py::ssize_t masked_dst_size, // mask_dst is 1D
py::ssize_t masked_dst_stride,
const ssize_t *packed_masked_src_shape_strides, // [masked_src_shape,
// masked_src_strides],
// length 2*masked_nd
ssize_t masked_dst_size, // mask_dst is 1D
ssize_t masked_dst_stride,
const std::vector<sycl::event> &depends = {})
{
// using MaskedExtractStridedFunctor;
Expand Down Expand Up @@ -381,33 +379,33 @@ class masked_place_all_slices_strided_impl_krn;

typedef sycl::event (*masked_place_all_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
py::ssize_t,
ssize_t,
char *,
const char *,
const char *,
int,
py::ssize_t const *,
py::ssize_t,
py::ssize_t,
ssize_t const *,
ssize_t,
ssize_t,
const std::vector<sycl::event> &);

template <typename dataT, typename indT>
sycl::event masked_place_all_slices_strided_impl(
sycl::queue &exec_q,
py::ssize_t iteration_size,
ssize_t iteration_size,
char *dst_p,
const char *cumsum_p,
const char *rhs_p,
int nd,
const py::ssize_t
const ssize_t
*packed_dst_shape_strides, // [dst_shape, dst_strides], length 2*nd
py::ssize_t rhs_size, // rhs is 1D
py::ssize_t rhs_stride,
ssize_t rhs_size, // rhs is 1D
ssize_t rhs_stride,
const std::vector<sycl::event> &depends = {})
{
TwoZeroOffsets_Indexer orthog_dst_rhs_indexer{};

/* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const
/* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
* *_packed_shape_strides) */
StridedIndexer masked_dst_indexer(nd, 0, packed_dst_shape_strides);
Strided1DCyclicIndexer masked_rhs_indexer(0, rhs_size, rhs_stride);
Expand All @@ -431,19 +429,19 @@ sycl::event masked_place_all_slices_strided_impl(

typedef sycl::event (*masked_place_some_slices_strided_impl_fn_ptr_t)(
sycl::queue &,
py::ssize_t,
py::ssize_t,
ssize_t,
ssize_t,
char *,
const char *,
const char *,
int,
py::ssize_t const *,
py::ssize_t,
py::ssize_t,
ssize_t const *,
ssize_t,
ssize_t,
int,
py::ssize_t const *,
py::ssize_t,
py::ssize_t,
ssize_t const *,
ssize_t,
ssize_t,
const std::vector<sycl::event> &);

template <typename OrthoIndexerT,
Expand All @@ -456,31 +454,31 @@ class masked_place_some_slices_strided_impl_krn;
template <typename dataT, typename indT>
sycl::event masked_place_some_slices_strided_impl(
sycl::queue &exec_q,
py::ssize_t orthog_nelems,
py::ssize_t masked_nelems,
ssize_t orthog_nelems,
ssize_t masked_nelems,
char *dst_p,
const char *cumsum_p,
const char *rhs_p,
int orthog_nd,
const py::ssize_t
const ssize_t
*packed_ortho_dst_rhs_shape_strides, // [ortho_shape, ortho_dst_strides,
// ortho_rhs_strides], length
// 3*ortho_nd
py::ssize_t ortho_dst_offset,
py::ssize_t ortho_rhs_offset,
ssize_t ortho_dst_offset,
ssize_t ortho_rhs_offset,
int masked_nd,
const py::ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape,
// masked_dst_strides],
// length 2*masked_nd
py::ssize_t masked_rhs_size, // mask_dst is 1D
py::ssize_t masked_rhs_stride,
const ssize_t *packed_masked_dst_shape_strides, // [masked_dst_shape,
// masked_dst_strides],
// length 2*masked_nd
ssize_t masked_rhs_size, // mask_dst is 1D
ssize_t masked_rhs_stride,
const std::vector<sycl::event> &depends = {})
{
TwoOffsets_StridedIndexer orthog_dst_rhs_indexer{
orthog_nd, ortho_dst_offset, ortho_rhs_offset,
packed_ortho_dst_rhs_shape_strides};

/* StridedIndexer(int _nd, py::ssize_t _offset, py::ssize_t const
/* StridedIndexer(int _nd, ssize_t _offset, ssize_t const
* *_packed_shape_strides) */
StridedIndexer masked_dst_indexer{masked_nd, 0,
packed_masked_dst_shape_strides};
Expand Down Expand Up @@ -550,22 +548,22 @@ template <typename T1, typename T2> class non_zero_indexes_krn;

typedef sycl::event (*non_zero_indexes_fn_ptr_t)(
sycl::queue &,
py::ssize_t,
py::ssize_t,
ssize_t,
ssize_t,
int,
const char *,
char *,
const py::ssize_t *,
const ssize_t *,
std::vector<sycl::event> const &);

template <typename indT1, typename indT2>
sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
py::ssize_t iter_size,
py::ssize_t nz_elems,
ssize_t iter_size,
ssize_t nz_elems,
int nd,
const char *cumsum_cp,
char *indexes_cp,
const py::ssize_t *mask_shape,
const ssize_t *mask_shape,
std::vector<sycl::event> const &depends)
{
const indT1 *cumsum_data = reinterpret_cast<const indT1 *>(cumsum_cp);
Expand All @@ -582,11 +580,11 @@ sycl::event non_zero_indexes_impl(sycl::queue &exec_q,
auto cs_prev_val = (i > 0) ? cumsum_data[i - 1] : indT1(0);
bool cond = (cs_curr_val == cs_prev_val);

py::ssize_t i_ = static_cast<py::ssize_t>(i);
ssize_t i_ = static_cast<ssize_t>(i);
for (int dim = nd; --dim > 0;) {
auto sd = mask_shape[dim];
py::ssize_t q = i_ / sd;
py::ssize_t r = (i_ - q * sd);
ssize_t q = i_ / sd;
ssize_t r = (i_ - q * sd);
if (cond) {
indexes_data[cs_curr_val + dim * nz_elems] =
static_cast<indT2>(r);
Expand Down
Loading
Loading