From 57541deadf9fc8037a121cd65b72875e10158a3c Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Tue, 25 Apr 2023 22:23:56 +0800
Subject: [PATCH 1/4] Optimize contiguous array interface input.

Type cast the array once instead of performing type casts for every
individual element.
---
 src/data/array_interface.h | 74 ++++++++++++++++++++++++++++++++++++--
 src/data/data.cc           |  6 ++--
 2 files changed, 75 insertions(+), 5 deletions(-)

diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index e9045899b8dd..bccf88220ade 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -572,19 +572,87 @@ class ArrayInterface {
   // Used only by columnar format.
   RBitField8 valid;
   // Array stride
-  size_t strides[D]{0};
+  std::size_t strides[D]{0};
   // Array shape
-  size_t shape[D]{0};
+  std::size_t shape[D]{0};
   // Type erased pointer referencing the data.
   void const *data{nullptr};
   // Total number of items
-  size_t n{0};
+  std::size_t n{0};
   // Whether the memory is c-contiguous
   bool is_contiguous{false};
   // RTTI, initialized to the f16 to avoid masking potential bugs in initialization.
   ArrayInterfaceHandler::Type type{ArrayInterfaceHandler::kF16};
 };
 
+template <typename Fn, std::int32_t D>
+void ElementWiseOp(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
+  // Only used for cuDF at the moment.
+  CHECK_EQ(array.valid.Size(), 0);
+  auto dispatch = [&](auto t) {
+    using T = decltype(t);
+    linalg::TensorView<T const, D> tensor{
+        common::Span<T const>{static_cast<T const *>(array.data), array.n}, array.shape,
+        array.strides, device};
+    fn(tensor);
+  };
+  switch (array.type) {
+    case ArrayInterfaceHandler::kF2: {
+#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
+      dispatch(__half{});
+#endif
+      break;
+    }
+    case ArrayInterfaceHandler::kF4: {
+      dispatch(float{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF8: {
+      dispatch(double{});
+      break;
+    }
+    case ArrayInterfaceHandler::kF16: {
+      using T = long double;
+      CHECK(sizeof(long double) == 16)
+          << "128-bit floating point is not supported on current platform.";
+      dispatch(T{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI1: {
+      dispatch(std::int8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI2: {
+      dispatch(std::int16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI4: {
+      dispatch(std::int32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kI8: {
+      dispatch(std::int64_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU1: {
+      dispatch(std::uint8_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU2: {
+      dispatch(std::uint16_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU4: {
+      dispatch(std::uint32_t{});
+      break;
+    }
+    case ArrayInterfaceHandler::kU8: {
+      dispatch(std::uint64_t{});
+      break;
+    }
+  }
+}
+
 /**
  * \brief Helper for type casting.
  */
diff --git a/src/data/data.cc b/src/data/data.cc
index 694bc48b99d8..60c0effbf2d9 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -429,8 +429,10 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<
   p_out->Reshape(array.shape);
   auto t = p_out->View(Context::kCpuId);
   CHECK(t.CContiguous());
-  linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
-    return linalg::detail::Apply(TypedIndex<T, D>{array}, linalg::UnravelIndex(i, t.Shape()));
+  ElementWiseOp(array, Context::kCpuId, [&](auto in) {
+    linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
+      return linalg::detail::Apply(in, linalg::UnravelIndex(i, t.Shape()));
+    });
   });
 }
 }  // namespace
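The heart of [PATCH 1/4] is a dispatch-once pattern: the switch over the runtime
dtype runs a single time, handing the hot loop a statically typed view, instead
of re-running the type dispatch for every element. A minimal standalone sketch of
the idea follows; the DType enum, DispatchOnce, and SumAll names are invented for
illustration and are not part of XGBoost:

    #include <cstddef>
    #include <cstdint>

    // Runtime type tag, playing the role of ArrayInterfaceHandler::Type.
    enum class DType { kF4, kF8, kI4 };

    // Dispatch on the tag once, then hand a typed pointer to the caller's loop.
    template <typename Fn>
    void DispatchOnce(DType t, void const *data, Fn fn) {
      switch (t) {
        case DType::kF4: fn(static_cast<float const *>(data)); break;
        case DType::kF8: fn(static_cast<double const *>(data)); break;
        case DType::kI4: fn(static_cast<std::int32_t const *>(data)); break;
      }
    }

    double SumAll(DType t, void const *data, std::size_t n) {
      double acc = 0.0;
      DispatchOnce(t, data, [&](auto const *typed) {
        // The loop body is instantiated once per type; no per-element switch remains.
        for (std::size_t i = 0; i < n; ++i) acc += static_cast<double>(typed[i]);
      });
      return acc;
    }

Before the patch, the equivalent branch ran inside TypedIndex for each of the n
elements; after it, the dispatch cost is paid once per call.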
From 9742b5c6eae708d466007f9d3e54055ec3497a74 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 26 Apr 2023 02:33:07 +0800
Subject: [PATCH 2/4] Fix debug check.

---
 src/common/error_msg.h     |  4 ++++
 src/data/array_interface.h | 24 ++++++++++++++----------
 src/data/data.cc           |  6 +++---
 3 files changed, 21 insertions(+), 13 deletions(-)

diff --git a/src/common/error_msg.h b/src/common/error_msg.h
index 3dbb7f52c150..4415bf2ee81b 100644
--- a/src/common/error_msg.h
+++ b/src/common/error_msg.h
@@ -24,5 +24,9 @@ constexpr StringView LabelScoreSize() {
 constexpr StringView InfInData() {
   return "Input data contains `inf` or a value too large, while `missing` is not set to `inf`";
 }
+
+constexpr StringView NoF128() {
+  return "128-bit floating point is not supported on current platform.";
+}
 }  // namespace xgboost::error
 #endif  // XGBOOST_COMMON_ERROR_MSG_H_
diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index bccf88220ade..358b9bddea9f 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -7,7 +7,7 @@
 #define XGBOOST_DATA_ARRAY_INTERFACE_H_
 
 #include <algorithm>
-#include <cstddef>  // std::size_t
+#include <cstddef>  // for size_t
 #include <cstdint>
 #include <map>
 #include <string>
@@ -17,6 +17,7 @@
 
 #include "../common/bitfield.h"
 #include "../common/common.h"
+#include "../common/error_msg.h"  // for NoF128
 #include "xgboost/base.h"
 #include "xgboost/data.h"
 #include "xgboost/json.h"
@@ -454,9 +455,8 @@ class ArrayInterface {
   void AssignType(StringView typestr) {
     using T = ArrayInterfaceHandler::Type;
     if (typestr.size() == 4 && typestr[1] == 'f' && typestr[2] == '1' && typestr[3] == '6') {
+      CHECK(sizeof(long double) == 16) << error::NoF128();
       type = T::kF16;
-      CHECK(sizeof(long double) == 16)
-          << "128-bit floating point is not supported on current platform.";
     } else if (typestr[1] == 'f' && typestr[2] == '2') {
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 600
       type = T::kF2;
@@ -586,14 +586,19 @@ class ArrayInterface {
 };
 
 template <typename Fn, std::int32_t D>
-void ElementWiseOp(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
+void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
   // Only used for cuDF at the moment.
   CHECK_EQ(array.valid.Size(), 0);
   auto dispatch = [&](auto t) {
-    using T = decltype(t);
-    linalg::TensorView<T const, D> tensor{
-        common::Span<T const>{static_cast<T const *>(array.data), array.n}, array.shape,
-        array.strides, device};
+    using T = std::remove_const_t<decltype(t)> const;
+    // Set the data size to max as we don't know the original size of a sliced array:
+    //
+    // Slicing an array A with shape (4, 2, 3) and stride (6, 3, 1) by [:, 1, :] results
+    // in an array B with shape (4, 3) and strides (6, 1). We can't calculate the original
+    // size 24 based on the slice.
+    linalg::TensorView<T, D> tensor{common::Span<T>{static_cast<T *>(array.data),
+                                                    std::numeric_limits<std::size_t>::max()},
+                                    array.shape, array.strides, device};
     fn(tensor);
   };
   switch (array.type) {
@@ -613,8 +618,7 @@ void ElementWiseOp(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
   }
   case ArrayInterfaceHandler::kF16: {
     using T = long double;
-    CHECK(sizeof(long double) == 16)
-        << "128-bit floating point is not supported on current platform.";
+    CHECK(sizeof(long double) == 16) << error::NoF128();
     dispatch(T{});
     break;
   }
diff --git a/src/data/data.cc b/src/data/data.cc
index 60c0effbf2d9..9d842cfa3311 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -429,9 +429,9 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<
   p_out->Reshape(array.shape);
   auto t = p_out->View(Context::kCpuId);
   CHECK(t.CContiguous());
-  ElementWiseOp(array, Context::kCpuId, [&](auto in) {
-    linalg::ElementWiseTransformHost(t, ctx.Threads(), [&](auto i, auto) {
-      return linalg::detail::Apply(in, linalg::UnravelIndex(i, t.Shape()));
+  DispatchDType(array, Context::kCpuId, [&](auto in) {
+    linalg::ElementWiseTransformHost(t, 1, [&](auto i, auto) {
+      return std::apply(in, linalg::UnravelIndex(i, t.Shape()));
     });
   });
 }

From 6e4fd644737b378f4baec539271fc929e7625ea6 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 26 Apr 2023 02:40:55 +0800
Subject: [PATCH 3/4] merge.

---
 src/data/array_interface.h |  7 +++----
 src/data/data.cc           | 11 ++++++-----
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index 358b9bddea9f..ce166d09a033 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -596,10 +596,9 @@ void DispatchDType(ArrayInterface<D> const array, std::int32_t device, Fn fn) {
     // Slicing an array A with shape (4, 2, 3) and stride (6, 3, 1) by [:, 1, :] results
     // in an array B with shape (4, 3) and strides (6, 1). We can't calculate the original
     // size 24 based on the slice.
-    linalg::TensorView<T, D> tensor{common::Span<T>{static_cast<T *>(array.data),
-                                                    std::numeric_limits<std::size_t>::max()},
-                                    array.shape, array.strides, device};
-    fn(tensor);
+    fn(linalg::TensorView<T, D>{common::Span<T>{static_cast<T *>(array.data),
+                                                std::numeric_limits<std::size_t>::max()},
+                                array.shape, array.strides, device});
   };
   switch (array.type) {
diff --git a/src/data/data.cc b/src/data/data.cc
index 9d842cfa3311..d611a400ec7a 100644
--- a/src/data/data.cc
+++ b/src/data/data.cc
@@ -427,11 +427,12 @@ void CopyTensorInfoImpl(Context const& ctx, Json arr_interface, linalg::Tensor<
     return;
   }
   p_out->Reshape(array.shape);
-  auto t = p_out->View(Context::kCpuId);
-  CHECK(t.CContiguous());
-  DispatchDType(array, Context::kCpuId, [&](auto in) {
-    linalg::ElementWiseTransformHost(t, 1, [&](auto i, auto) {
-      return std::apply(in, linalg::UnravelIndex(i, t.Shape()));
+  auto t_out = p_out->View(Context::kCpuId);
+  CHECK(t_out.CContiguous());
+  auto const shape = t_out.Shape();
+  DispatchDType(array, Context::kCpuId, [&](auto&& in) {
+    linalg::ElementWiseTransformHost(t_out, ctx.Threads(), [&](auto i, auto) {
+      return std::apply(in, linalg::UnravelIndex(i, shape));
     });
   });
 }
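The slicing comment carried through [PATCH 2/4] and [PATCH 3/4] is worth
unpacking: with a strided view, an element's offset is the dot product of its
index and the strides, so a sliced view can legally address memory well past
what its shape alone implies. That is why the span length is pinned to
std::numeric_limits<std::size_t>::max() instead of being derived from the
shape. Working through the comment's own example (Offset2D is an illustrative
helper, not XGBoost code):

    #include <array>
    #include <cstddef>

    // Offset of element (i0, i1) in a strided 2-D view: dot(index, strides).
    std::size_t Offset2D(std::array<std::size_t, 2> index,
                         std::array<std::size_t, 2> strides) {
      return index[0] * strides[0] + index[1] * strides[1];
    }

    // A has shape (4, 2, 3), strides (6, 3, 1), and a 24-element buffer.
    // B = A[:, 1, :] has shape (4, 3), strides (6, 1), and base offset 3.
    // B holds only 4 * 3 = 12 elements, yet its last element (3, 2) lives at
    // offset 3 + Offset2D({3, 2}, {6, 1}) = 3 + 20 = 23, near the end of A's
    // buffer. A span sized from B's shape alone would therefore be too short.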
From aa0dd8b29ff9f3fe66eafbae4c9e59a75f8c36e3 Mon Sep 17 00:00:00 2001
From: Jiaming Yuan
Date: Wed, 26 Apr 2023 02:47:34 +0800
Subject: [PATCH 4/4] cpplint.

---
 src/data/array_interface.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/src/data/array_interface.h b/src/data/array_interface.h
index ce166d09a033..fee22203c111 100644
--- a/src/data/array_interface.h
+++ b/src/data/array_interface.h
@@ -9,6 +9,7 @@
 #include <algorithm>
 #include <cstddef>  // for size_t
 #include <cstdint>
+#include <limits>  // for numeric_limits
 #include <map>
 #include <string>
 #include <type_traits>  // std::alignment_of,std::remove_pointer_t
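Taken together, the reworked data.cc loop leans on two pieces: linalg::UnravelIndex
turns a flat index over the contiguous output into a coordinate tuple, and
std::apply feeds that tuple to the typed, possibly strided input view, which is
callable with unpacked indices. A standalone analogue of that loop, with View2D
and Unravel2D as invented stand-ins for linalg::TensorView and
linalg::UnravelIndex:

    #include <array>
    #include <cstddef>
    #include <tuple>

    // Row-major unravel: flat index i -> coordinates for the given shape.
    // E.g. i = 7 with shape (4, 3) unravels to (2, 1), since 7 == 2 * 3 + 1.
    std::tuple<std::size_t, std::size_t> Unravel2D(std::size_t i,
                                                   std::array<std::size_t, 2> shape) {
      return {i / shape[1], i % shape[1]};
    }

    // A toy strided 2-D view, callable with unpacked coordinates.
    struct View2D {
      double const *data;
      std::size_t stride;  // elements to skip per row
      double operator()(std::size_t r, std::size_t c) const { return data[r * stride + c]; }
    };

    int main() {
      double buf[12] = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
      View2D in{buf, 3};
      double out[12];
      // The contiguous output drives a flat loop; std::apply bridges the
      // coordinate tuple and the view's call operator, as in CopyTensorInfoImpl.
      for (std::size_t i = 0; i < 12; ++i) {
        out[i] = std::apply(in, Unravel2D(i, {4, 3}));
      }
      return out[7] == 7.0 ? 0 : 1;
    }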