From 57541deadf9fc8037a121cd65b72875e10158a3c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 25 Apr 2023 22:23:56 +0800
Subject: [PATCH 1/4] Optimize contiguous array interface input.

Type cast the array once instead of performing type casts for every
individual element.
---
 src/data/array_interface.h | 74 ++++++++++++++++++++++++++++++++++++--
 src/data/data.cc           |  6 ++--
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index e9045899b8dd..bccf88220ade 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -572,19 +572,87 @@ class ArrayInterface {
   // Used only by columnar format.
   RBitField8 valid;
   // Array stride
-  size_t strides[D]{0};
+  std::size_t strides[D]{0};
   // Array shape
-  size_t shape[D]{0};
+  std::size_t shape[D]{0};
   // Type erased pointer referencing the data.
   void const *data{nullptr};
   // Total number of items
-  size_t n{0};
+  std::size_t n{0};
   // Whether the memory is c-contiguous
   bool is_contiguous{false};
   // RTTI, initialized to the f16 to avoid masking potential bugs in initialization.
   ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
 };
 
+template <typename Fn, std::int32_t D>
+void ElementWiseOp(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
+  // Only used for cuDF at the moment.
+  CHECK_EQ(array.valid.Size(), 0);
+  auto dispatch = [&](auto t) {
+    using T = decltype(t);
+    linalg::TensorView<T const, D> tensor{
+        common::Span<T const>{static_cast<T const *>(array.data), array.n}, array.shape,
+        array.strides, device};
+    fn(tensor);
+  };
+  switch (array.type) {
+    case ArrayInterfaceHandler::kF2: {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+      dispatch(__half{});
+#endif
+      break;
+    }
+    case ArrayInterfaceHandler::kF4: {
+      dispatch(float{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF8: {
+      dispatch(double{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF16: {
+      using T = long double;
+      CHECK(sizeof(long double) == 16)
+          << "128-bit floating point is not supported on current platform.";
+      dispatch(T{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI1: {
+      dispatch(std::int8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI2: {
+      dispatch(std::int16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI4: {
+      dispatch(std::int32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI8: {
+      dispatch(std::int64_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU1: {
+      dispatch(std::uint8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU2: {
+      dispatch(std::uint16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU4: {
+      dispatch(std::uint32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU8: {
+      dispatch(std::uint64_t{});
+      break;
+    }
+  }
+}
+
 /**
  * \brief Helper for type casting.
  */
diff --git a/src/data/data.cc b/src/data/data.cc
index 694bc48b99d8..60c0effbf2d9 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -429,8 +429,10 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<
   p_out->Reshape(array.shape);
   auto t = p_out->View(Context::kCpuId);
   CHECK(t.CContiguous());
-  linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
-    return linalg::detail::Apply(TypedIndex<T, D>{array}, linalg::UnravelIndex(i, t.Shape()));
+  ElementWiseOp(array, Context::kCpuId, [&](auto in) {
+    linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
+      return linalg::detail::Apply(in, linalg::UnravelIndex(i, t.Shape()));
+    });
   });
 }
 }  // namespace
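The heart of [PATCH 1/4] is a dispatch-once pattern: the switch over the runtime
dtype runs a single time, handing the hot loop a statically typed view, instead
of re-running the type dispatch for every element. A minimal standalone sketch of
the idea follows; the DType enum, DispatchOnce, and SumAll names are invented for
illustration and are not part of XGBoost:

    #include <cstddef>
    #include <cstdint>

    // Runtime type tag, playing the role of ArrayInterfaceHandler::Type.
    enum class DType { kF4, kF8, kI4 };

    // Dispatch on the tag once, then hand a typed pointer to the caller's loop.
    template <typename Fn>
    void DispatchOnce(DType t, void const *data, Fn fn) {
      switch (t) {
        case DType::kF4: fn(static_cast<float const *>(data)); break;
        case DType::kF8: fn(static_cast<double const *>(data)); break;
        case DType::kI4: fn(static_cast<std::int32_t const *>(data)); break;
      }
    }

    double SumAll(DType t, void const *data, std::size_t n) {
      double acc = 0.0;
      DispatchOnce(t, data, [&](auto const *typed) {
        // The loop body is instantiated once per type; no per-element switch remains.
        for (std::size_t i = 0; i < n; ++i) acc += static_cast<double>(typed[i]);
      });
      return acc;
    }

Before the patch, the equivalent branch ran inside TypedIndex for each of the n
elements; after it, the dispatch cost is paid once per call.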
From 9742b5c6eae708d466007f9d3e54055ec3497a74 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 26 Apr 2023 02:33:07 +0800
Subject: [PATCH 2/4] Fix debug check.

---
 src/common/error_msg.h     |  4 ++++
 src/data/array_interface.h | 24 ++++++++++++++----------
 src/data/data.cc           |  6 +++---
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 3dbb7f52c150..4415bf2ee81b 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -24,5 +24,9 @@ constexpr StringView LabelScoreSize() {
 constexpr StringView InfInData() {
   return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
 }
+
+constexpr StringView NoF128() {
+  return "128-bit floating point is not supported on current platform.";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index bccf88220ade..358b9bddea9f 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -7,7 +7,7 @@
 #define XGBOOST_DATA_ARRAY_INTERFACE_H_
 
 #include <algorithm>
-#include <cstddef>  // std::size_t
+#include <cstddef>  // for size_t
 #include <cstdint>
 #include <map>
 #include <string>
@@ -17,6 +17,7 @@
 
 #include "../common/bitfield.h"
 #include "../common/common.h"
+#include "../common/error_msg.h"  // for NoF128
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/json.h"
@@ -454,9 +455,8 @@ class ArrayInterface {
   void AssignType(StringView typestr) {
     using T = ArrayInterfaceHandler::Type;
     if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' && typestr[3] == '6') {
+      CHECK(sizeof(long double) == 16) << error::NoF128();
       type = T::kF16;
-      CHECK(sizeof(long double) == 16)
-          << "128-bit floating point is not supported on current platform.";
     } else if (typestr[1] == 'f' && typestr[2] == '2') {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
       type = T::kF2;
@@ -586,14 +586,19 @@ class ArrayInterface {
 };
 
 template <typename Fn, std::int32_t D>
-void ElementWiseOp(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
+void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
   // Only used for cuDF at the moment.
   CHECK_EQ(array.valid.Size(), 0);
   auto dispatch = [&](auto t) {
-    using T = decltype(t);
-    linalg::TensorView<T const, D> tensor{
-        common::Span<T const>{static_cast<T const *>(array.data), array.n}, array.shape,
-        array.strides, device};
+    using T = std::remove_const_t<decltype(t)> const;
+    // Set the data size to max as we don't know the original size of a sliced array:
+    //
+    // Slicing an array A with shape (4, 2, 3) and stride (6, 3, 1) by [:, 1, :] results
+    // in an array B with shape (4, 3) and strides (6, 1). We can't calculate the original
+    // size 24 based on the slice.
+    linalg::TensorView<T, D> tensor{common::Span<T>{static_cast<T *>(array.data),
+                                                    std::numeric_limits<std::size_t>::max()},
+                                    array.shape, array.strides, device};
     fn(tensor);
   };
   switch (array.type) {
@@ -613,8 +618,7 @@ void ElementWiseOp(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
   }
   case ArrayInterfaceHandler::kF16: {
     using T = long double;
-    CHECK(sizeof(long double) == 16)
-        << "128-bit floating point is not supported on current platform.";
+    CHECK(sizeof(long double) == 16) << error::NoF128();
     dispatch(T{});
     break;
   }
diff --git a/src/data/data.cc b/src/data/data.cc
index 60c0effbf2d9..9d842cfa3311 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -429,9 +429,9 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<
   p_out->Reshape(array.shape);
   auto t = p_out->View(Context::kCpuId);
   CHECK(t.CContiguous());
-  ElementWiseOp(array, Context::kCpuId, [&](auto in) {
-    linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
-      return linalg::detail::Apply(in, linalg::UnravelIndex(i, t.Shape()));
+  DispatchDType(array, Context::kCpuId, [&](auto in) {
+    linalg::ElementWiseTransformHost(t, 1, [&](auto i, auto) {
+      return std::apply(in, linalg::UnravelIndex(i, t.Shape()));
     });
   });
 }

From 6e4fd644737b378f4baec539271fc929e7625ea6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 26 Apr 2023 02:40:55 +0800
Subject: [PATCH 3/4] merge.

---
 src/data/array_interface.h |  7 +++----
 src/data/data.cc           | 11 ++++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index 358b9bddea9f..ce166d09a033 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -596,10 +596,9 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
     // Slicing an array A with shape (4, 2, 3) and stride (6, 3, 1) by [:, 1, :] results
     // in an array B with shape (4, 3) and strides (6, 1). We can't calculate the original
     // size 24 based on the slice.
-    linalg::TensorView<T, D> tensor{common::Span<T>{static_cast<T *>(array.data),
-                                                    std::numeric_limits<std::size_t>::max()},
-                                    array.shape, array.strides, device};
-    fn(tensor);
+    fn(linalg::TensorView<T, D>{common::Span<T>{static_cast<T *>(array.data),
+                                                std::numeric_limits<std::size_t>::max()},
+                                array.shape, array.strides, device});
   };
   switch (array.type) {
diff --git a/src/data/data.cc b/src/data/data.cc
index 9d842cfa3311..d611a400ec7a 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -427,11 +427,12 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<
     return;
   }
   p_out->Reshape(array.shape);
-  auto t = p_out->View(Context::kCpuId);
-  CHECK(t.CContiguous());
-  DispatchDType(array, Context::kCpuId, [&](auto in) {
-    linalg::ElementWiseTransformHost(t, 1, [&](auto i, auto) {
-      return std::apply(in, linalg::UnravelIndex(i, t.Shape()));
+  auto t_out = p_out->View(Context::kCpuId);
+  CHECK(t_out.CContiguous());
+  auto const shape = t_out.Shape();
+  DispatchDType(array, Context::kCpuId, [&](auto&& in) {
+    linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
+      return std::apply(in, linalg::UnravelIndex(i, shape));
     });
   });
 }
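The slicing comment carried through [PATCH 2/4] and [PATCH 3/4] is worth
unpacking: with a strided view, an element's offset is the dot product of its
index and the strides, so a sliced view can legally address memory well past
what its shape alone implies. That is why the span length is pinned to
std::numeric_limits<std::size_t>::max() instead of being derived from the
shape. Working through the comment's own example (Offset2D is an illustrative
helper, not XGBoost code):

    #include <array>
    #include <cstddef>

    // Offset of element (i0, i1) in a strided 2-D view: dot(index, strides).
    std::size_t Offset2D(std::array<std::size_t, 2> index,
                         std::array<std::size_t, 2> strides) {
      return index[0] * strides[0] + index[1] * strides[1];
    }

    // A has shape (4, 2, 3), strides (6, 3, 1), and a 24-element buffer.
    // B = A[:, 1, :] has shape (4, 3), strides (6, 1), and base offset 3.
    // B holds only 4 * 3 = 12 elements, yet its last element (3, 2) lives at
    // offset 3 + Offset2D({3, 2}, {6, 1}) = 3 + 20 = 23, near the end of A's
    // buffer. A span sized from B's shape alone would therefore be too short.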
From aa0dd8b29ff9f3fe66eafbae4c9e59a75f8c36e3 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 26 Apr 2023 02:47:34 +0800
Subject: [PATCH 4/4] cpplint.

---
 src/data/array_interface.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index ce166d09a033..fee22203c111 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cstddef>  // for size_t
 #include <cstdint>
+#include <limits>  // for numeric_limits
 #include <map>
 #include <string>
 #include <type_traits>  // std::alignment_of,std::remove_pointer_t
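Taken together, the reworked data.cc loop leans on two pieces: linalg::UnravelIndex
turns a flat index over the contiguous output into a coordinate tuple, and
std::apply feeds that tuple to the typed, possibly strided input view, which is
callable with unpacked indices. A standalone analogue of that loop, with View2D
and Unravel2D as invented stand-ins for linalg::TensorView and
linalg::UnravelIndex:

    #include <array>
    #include <cstddef>
    #include <tuple>

    // Row-major unravel: flat index i -> coordinates for the given shape.
    // E.g. i = 7 with shape (4, 3) unravels to (2, 1), since 7 == 2 * 3 + 1.
    std::tuple<std::size_t, std::size_t> Unravel2D(std::size_t i,
                                                   std::array<std::size_t, 2> shape) {
      return {i / shape[1], i % shape[1]};
    }

    // A toy strided 2-D view, callable with unpacked coordinates.
    struct View2D {
      double const *data;
      std::size_t stride;  // elements to skip per row
      double operator()(std::size_t r, std::size_t c) const { return data[r * stride + c]; }
    };

    int main() {
      double buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
      View2D in{buf, 3};
      double out[12];
      // The contiguous output drives a flat loop; std::apply bridges the
      // coordinate tuple and the view's call operator, as in CopyTensorInfoImpl.
      for (std::size_t i = 0; i < 12; ++i) {
        out[i] = std::apply(in, Unravel2D(i, {4, 3}));
      }
      return out[7] == 7.0 ? 0 : 1;
    }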