Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add from_arrow_host functions for cudf interop with nanoarrow #15645

Merged
merged 32 commits into from
May 29, 2024
Merged
Show file tree
Hide file tree
Changes from 25 commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
cf3c818
Add `from_arrow_device_host` functions for cudf interop with nanoarrow
zeroshade May 3, 2024
5e06b19
rename functions and lint
zeroshade May 6, 2024
79414d1
from feedback
zeroshade May 6, 2024
a05e788
back to original impl
zeroshade May 8, 2024
05edc9a
Merge branch 'branch-24.06' into from-arrow-device-host
zeroshade May 9, 2024
d58909f
Merge branch 'branch-24.06' into from-arrow-device-host
zeroshade May 14, 2024
7bfc8d4
updates from feedback and including extra comments
zeroshade May 14, 2024
e2a3c6b
Update cpp/src/interop/from_arrow_host.cu
zeroshade May 15, 2024
e2065cd
updates from feedback
zeroshade May 15, 2024
ff90122
Merge branch 'branch-24.06' into from-arrow-device-host
zeroshade May 15, 2024
7e117a8
removing includes by suggestion
zeroshade May 20, 2024
32ae306
Merge branch 'branch-24.06' into from-arrow-device-host
zeroshade May 20, 2024
5ac8d6c
updates from feedback
zeroshade May 21, 2024
679993c
add `from_arrow` overload
zeroshade May 21, 2024
8610c91
remove excess comment
zeroshade May 21, 2024
fc02aed
Merge branch 'branch-24.06' into from-arrow-device-host
zeroshade May 22, 2024
2f49293
shift function from to_arrow_utilities to just arrow_utilities
zeroshade May 22, 2024
5b57bb3
fix style problems
zeroshade May 22, 2024
923c422
forward declare ArrowArray
zeroshade May 22, 2024
d429004
fix forgotten view usage
zeroshade May 22, 2024
c0a3c4e
add `from_arrow_column` overload
zeroshade May 22, 2024
6fa921e
Merge branch 'branch-24.06' into from-arrow-device-host
zeroshade May 23, 2024
4fa83d0
fix test and lint
zeroshade May 23, 2024
ad28303
refactor and shift tests
zeroshade May 23, 2024
14f39e4
Merge branch 'branch-24.06' into from-arrow-device-host
vyasr May 23, 2024
4b4c887
fix build issues
zeroshade May 23, 2024
f357ad7
style fixes
zeroshade May 24, 2024
b11d846
fix expected exception
zeroshade May 24, 2024
0cbcfaf
Merge branch 'branch-24.06' into from-arrow-device-host
vyasr May 24, 2024
7203197
Update cpp/src/interop/from_arrow_host.cu
zeroshade May 28, 2024
08a8b54
Merge branch 'branch-24.08' into from-arrow-device-host
vyasr May 28, 2024
a7b078b
Merge branch 'branch-24.08' into from-arrow-device-host
vyasr May 29, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 2 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -360,11 +360,12 @@ add_library(
src/hash/xxhash_64.cu
src/interop/dlpack.cpp
src/interop/from_arrow.cu
src/interop/arrow_utilities.cpp
src/interop/to_arrow.cu
src/interop/to_arrow_device.cu
src/interop/from_arrow_device.cu
src/interop/from_arrow_host.cu
src/interop/to_arrow_schema.cpp
src/interop/to_arrow_utilities.cpp
src/interop/detail/arrow_allocator.cpp
src/io/avro/avro.cpp
src/io/avro/avro_gpu.cu
Expand Down
91 changes: 89 additions & 2 deletions cpp/include/cudf/interop.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ struct ArrowDeviceArray;

struct ArrowSchema;

struct ArrowArray;

namespace cudf {
/**
* @addtogroup interop_dlpack
Expand Down Expand Up @@ -348,6 +350,91 @@ std::unique_ptr<cudf::scalar> from_arrow(
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::device_async_resource_ref mr = rmm::mr::get_current_device_resource());

/**
 * @brief Create `cudf::table` from given ArrowArray and ArrowSchema input
 *
 * The conversion will not call release on the input Array.
 *
 * @throws std::invalid_argument if either schema or input is NULL
 *
 * @throws cudf::data_type_error if the input array is not a struct array.
 * See `from_arrow_column` for converting a single array into a `cudf::column`.
 *
 * @param schema `ArrowSchema` pointer to describe the type of the data
 * @param input `ArrowArray` pointer that needs to be converted to cudf::table
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate `cudf::table`
 * @return cudf table generated from given arrow data
 */
std::unique_ptr<cudf::table> from_arrow(
  ArrowSchema const* schema,
  ArrowArray const* input,
  // Defaults mirror the other interop entry points declared in this header.
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
 * @brief Create `cudf::column` from a given ArrowArray and ArrowSchema input
 *
 * The conversion will not call release on the input Array.
 *
 * @throws std::invalid_argument if either schema or input is NULL
 *
 * @param schema `ArrowSchema` pointer to describe the type of the data
 * @param input `ArrowArray` pointer that needs to be converted to cudf::column
 * @param stream CUDA stream used for device memory operations and kernel launches
 * @param mr Device memory resource used to allocate `cudf::column`
 * @return cudf column generated from given arrow data
 */
std::unique_ptr<cudf::column> from_arrow_column(
  ArrowSchema const* schema,
  ArrowArray const* input,
  // Defaults mirror the other interop entry points declared in this header.
  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::table` from given ArrowDeviceArray input
*
* The device_type of `input` must be `ARROW_DEVICE_CPU`. The conversion will not
* call release on the input Array.
*
* @throws std::invalid_argument if either schema or input is NULL
*
* @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
*
* @throws cudf::data_type_error if the input array is not a struct array;
* non-struct arrays should be passed to `from_arrow_host_column` instead.
*
* @param schema `ArrowSchema` pointer to describe the type of the data
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform cuda allocation
* @return cudf table generated from the given Arrow data
*/
std::unique_ptr<table> from_arrow_host(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create `cudf::column` from given ArrowDeviceArray input
*
* The device_type of `input` must be `ARROW_DEVICE_CPU`. The conversion will not
* call release on the input Array.
*
* @throws std::invalid_argument if either schema or input is NULL
*
* @throws std::invalid_argument if the device_type is not `ARROW_DEVICE_CPU`
*
* @throws cudf::data_type_error if input arrow data type is not supported in cudf.
*
* @param schema `ArrowSchema` pointer to describe the type of the data
* @param input `ArrowDeviceArray` pointer to object owning the Arrow data
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to perform cuda allocation
* @return cudf column generated from the given Arrow data
*/
std::unique_ptr<column> from_arrow_host_column(
ArrowSchema const* schema,
ArrowDeviceArray const* input,
rmm::cuda_stream_view stream = cudf::get_default_stream(),
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief typedef for a vector of owning columns, used for conversion from ArrowDeviceArray
*
Expand Down Expand Up @@ -398,7 +485,7 @@ using unique_table_view_t =
* `ArrowDeviceArray` after it is no longer needed, and that the `cudf::table_view` is not
* accessed after this happens.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
zeroshade marked this conversation as resolved.
Show resolved Hide resolved
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error if the input array is not a struct array, non-struct
Expand Down Expand Up @@ -446,7 +533,7 @@ using unique_column_view_t =
* `ArrowDeviceArray` after it is no longer needed, and that the `cudf::column_view` is not
* accessed after this happens.
*
* @throws cudf::logic_error if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* @throws std::invalid_argument if device_type is not `ARROW_DEVICE_CUDA`, `ARROW_DEVICE_CUDA_HOST`
* or `ARROW_DEVICE_CUDA_MANAGED`
*
* @throws cudf::data_type_error input arrow data type is not supported.
Expand Down
90 changes: 90 additions & 0 deletions cpp/src/interop/arrow_utilities.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "arrow_utilities.hpp"

#include <cudf/types.hpp>
#include <cudf/utilities/error.hpp>

#include <nanoarrow/nanoarrow.h>

namespace cudf {
namespace detail {
// Translate the Arrow type held by a schema view into the matching cudf data_type.
// Types without a cudf counterpart are rejected with cudf::data_type_error.
data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view)
{
  auto const arrow_id = arrow_view->type;

  // Timestamps: the time unit of the view selects the concrete cudf type id.
  if (arrow_id == NANOARROW_TYPE_TIMESTAMP) {
    switch (arrow_view->time_unit) {
      case NANOARROW_TIME_UNIT_SECOND: return data_type{type_id::TIMESTAMP_SECONDS};
      case NANOARROW_TIME_UNIT_MILLI: return data_type{type_id::TIMESTAMP_MILLISECONDS};
      case NANOARROW_TIME_UNIT_MICRO: return data_type{type_id::TIMESTAMP_MICROSECONDS};
      case NANOARROW_TIME_UNIT_NANO: return data_type{type_id::TIMESTAMP_NANOSECONDS};
      default: CUDF_FAIL("Unsupported timestamp unit in arrow", cudf::data_type_error);
    }
  }

  // Durations: resolved through the time unit in the same way as timestamps.
  if (arrow_id == NANOARROW_TYPE_DURATION) {
    switch (arrow_view->time_unit) {
      case NANOARROW_TIME_UNIT_SECOND: return data_type{type_id::DURATION_SECONDS};
      case NANOARROW_TIME_UNIT_MILLI: return data_type{type_id::DURATION_MILLISECONDS};
      case NANOARROW_TIME_UNIT_MICRO: return data_type{type_id::DURATION_MICROSECONDS};
      case NANOARROW_TIME_UNIT_NANO: return data_type{type_id::DURATION_NANOSECONDS};
      default: CUDF_FAIL("Unsupported duration unit in arrow", cudf::data_type_error);
    }
  }

  // Decimals: the cudf data_type is built from the negated Arrow scale.
  if (arrow_id == NANOARROW_TYPE_DECIMAL128) {
    return data_type{type_id::DECIMAL128, -arrow_view->decimal_scale};
  }

  // Everything else is a direct one-to-one mapping of type ids.
  switch (arrow_id) {
    case NANOARROW_TYPE_NA: return data_type{type_id::EMPTY};
    case NANOARROW_TYPE_BOOL: return data_type{type_id::BOOL8};
    case NANOARROW_TYPE_INT8: return data_type{type_id::INT8};
    case NANOARROW_TYPE_INT16: return data_type{type_id::INT16};
    case NANOARROW_TYPE_INT32: return data_type{type_id::INT32};
    case NANOARROW_TYPE_INT64: return data_type{type_id::INT64};
    case NANOARROW_TYPE_UINT8: return data_type{type_id::UINT8};
    case NANOARROW_TYPE_UINT16: return data_type{type_id::UINT16};
    case NANOARROW_TYPE_UINT32: return data_type{type_id::UINT32};
    case NANOARROW_TYPE_UINT64: return data_type{type_id::UINT64};
    case NANOARROW_TYPE_FLOAT: return data_type{type_id::FLOAT32};
    case NANOARROW_TYPE_DOUBLE: return data_type{type_id::FLOAT64};
    case NANOARROW_TYPE_DATE32: return data_type{type_id::TIMESTAMP_DAYS};
    case NANOARROW_TYPE_STRING: return data_type{type_id::STRING};
    case NANOARROW_TYPE_LIST: return data_type{type_id::LIST};
    case NANOARROW_TYPE_DICTIONARY: return data_type{type_id::DICTIONARY32};
    case NANOARROW_TYPE_STRUCT: return data_type{type_id::STRUCT};
    default: CUDF_FAIL("Unsupported type_id conversion to cudf", cudf::data_type_error);
  }
}

// Translate a cudf type id into the nanoarrow type enum with the same storage.
// Only boolean, integer, floating point and TIMESTAMP_DAYS ids are mapped here;
// any other id is rejected with cudf::data_type_error.
ArrowType id_to_arrow_type(cudf::type_id id)
{
  using tid = cudf::type_id;

  switch (id) {
    case tid::BOOL8: return NANOARROW_TYPE_BOOL;
    case tid::INT8: return NANOARROW_TYPE_INT8;
    case tid::INT16: return NANOARROW_TYPE_INT16;
    case tid::INT32: return NANOARROW_TYPE_INT32;
    case tid::INT64: return NANOARROW_TYPE_INT64;
    case tid::UINT8: return NANOARROW_TYPE_UINT8;
    case tid::UINT16: return NANOARROW_TYPE_UINT16;
    case tid::UINT32: return NANOARROW_TYPE_UINT32;
    case tid::UINT64: return NANOARROW_TYPE_UINT64;
    case tid::FLOAT32: return NANOARROW_TYPE_FLOAT;
    case tid::FLOAT64: return NANOARROW_TYPE_DOUBLE;
    case tid::TIMESTAMP_DAYS: return NANOARROW_TYPE_DATE32;
    default: break;
  }

  // Single failure exit for every id without a direct Arrow equivalent.
  CUDF_FAIL("Unsupported type_id conversion to arrow type", cudf::data_type_error);
}

} // namespace detail
} // namespace cudf
21 changes: 21 additions & 0 deletions cpp/src/interop/arrow_utilities.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@

#pragma once

#include <cudf/types.hpp>

#include <nanoarrow/nanoarrow.h>
#include <nanoarrow/nanoarrow_types.h>

namespace cudf {
namespace detail {

Expand All @@ -26,5 +31,21 @@ namespace detail {
static constexpr int validity_buffer_idx = 0;
static constexpr int fixed_width_data_buffer_idx = 1;

/**
* @brief Map the Arrow type described by a schema view to the corresponding cudf data type
*
* Timestamp and duration types are resolved using the view's time unit, and
* decimal128 yields a `data_type` carrying the negated Arrow decimal scale.
*
* @throws cudf::data_type_error if the Arrow type or its time unit has no cudf equivalent
*
* @param arrow_view SchemaView to pull the logical and storage types from
* @return cudf data type matching the Arrow type
*/
data_type arrow_to_cudf_type(const ArrowSchemaView* arrow_view);
vyasr marked this conversation as resolved.
Show resolved Hide resolved

/**
* @brief Map cudf column type id to ArrowType id
*
* Only boolean, integer, floating point and `TIMESTAMP_DAYS` ids are mapped.
*
* @throws cudf::data_type_error if the id has no mapping to an ArrowType
*
* @param id Column type id
* @return ArrowType id
*/
ArrowType id_to_arrow_type(cudf::type_id id);

} // namespace detail
} // namespace cudf
Loading
Loading