Skip to content

Commit

Permalink
Adding support for Decimal/Fixed-point to ORC reader (#7970)
Browse files Browse the repository at this point in the history
Added support for Decimal/fixed-point columns in the ORC reader, along with test cases. All decimal columns are read as Decimal64 columns; if a column's precision is greater than 18, the read fails loudly. This PR also removes a couple of reader options (`decimals_as_float64` and `forced_decimals_scale`) that are of no use after the addition of Decimal support.

#7126

Authors:
  - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu)

Approvers:
  - Devavret Makkar (https://github.com/devavret)
  - Vukasin Milovanovic (https://github.com/vuule)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #7970
  • Loading branch information
rgsl888prabhu authored Apr 20, 2021
1 parent 5c2f744 commit d501d2c
Show file tree
Hide file tree
Showing 10 changed files with 81 additions and 186 deletions.
54 changes: 0 additions & 54 deletions cpp/include/cudf/io/orc.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -63,12 +63,6 @@ class orc_reader_options {
// Cast timestamp columns to a specific type
data_type _timestamp_type{type_id::EMPTY};

// Whether to convert decimals to float64
bool _decimals_as_float64 = true;
// For decimals as int, optional forced decimal scale;
// -1 is auto (column scale), >=0: number of fractional digits
size_type _forced_decimals_scale = -1;

friend orc_reader_options_builder;

/**
Expand Down Expand Up @@ -134,16 +128,6 @@ class orc_reader_options {
*/
data_type get_timestamp_type() const { return _timestamp_type; }

/**
* @brief Whether to convert decimals to float64.
*/
bool is_enabled_decimals_as_float64() const { return _decimals_as_float64; }

/**
* @brief Returns whether decimal scale is inferred or forced to have limited fractional digits.
*/
size_type get_forced_decimals_scale() const { return _forced_decimals_scale; }

// Setters

/**
Expand Down Expand Up @@ -207,20 +191,6 @@ class orc_reader_options {
* @param type Type of timestamp.
*/
void set_timestamp_type(data_type type) { _timestamp_type = type; }

/**
* @brief Enable/Disable conversion of decimals to float64.
*
* @param val Boolean value to enable/disable.
*/
void set_decimals_as_float64(bool val) { _decimals_as_float64 = val; }

/**
* @brief Sets whether decimal scale is inferred or forced to have limited fractional digits.
*
* @param val Length of fractional digits.
*/
void set_forced_decimals_scale(size_type val) { _forced_decimals_scale = val; }
};

class orc_reader_options_builder {
Expand Down Expand Up @@ -325,30 +295,6 @@ class orc_reader_options_builder {
return *this;
}

/**
* @brief Enable/Disable conversion of decimals to float64.
*
* @param val Boolean value to enable/disable.
* @return this for chaining.
*/
orc_reader_options_builder& decimals_as_float64(bool val)
{
options._decimals_as_float64 = val;
return *this;
}

/**
* @brief Sets whether decimal scale is inferred or forced to have limited fractional digits.
*
* @param val Length of fractional digits.
* @return this for chaining.
*/
orc_reader_options_builder& forced_decimals_scale(size_type val)
{
options._forced_decimals_scale = val;
return *this;
}

/**
* @brief move orc_reader_options member once it's built.
*/
Expand Down
7 changes: 1 addition & 6 deletions cpp/src/io/orc/orc_gpu.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -80,11 +80,6 @@ struct DictionaryEntry {
uint32_t len; // Length in data stream
};

/**
* @brief Mask to indicate conversion from decimals to float64
*/
constexpr int orc_decimal2float64_scale = 0x80;

/**
* @brief Struct to describe per stripe's column information
*/
Expand Down
39 changes: 16 additions & 23 deletions cpp/src/io/orc/reader_impl.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -50,8 +50,7 @@ namespace {
*/
constexpr type_id to_type_id(const orc::SchemaType &schema,
bool use_np_dtypes,
type_id timestamp_type_id,
bool decimals_as_float64)
type_id timestamp_type_id)
{
switch (schema.kind) {
case orc::BOOLEAN: return type_id::BOOL8;
Expand All @@ -73,9 +72,7 @@ constexpr type_id to_type_id(const orc::SchemaType &schema,
case orc::DATE:
// There isn't a (DAYS -> np.dtype) mapping
return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
case orc::DECIMAL:
// There isn't an arbitrary-precision type in cuDF, so map as float or int
return (decimals_as_float64) ? type_id::FLOAT64 : type_id::INT64;
case orc::DECIMAL: return type_id::DECIMAL64;
default: break;
}

Expand Down Expand Up @@ -406,10 +403,6 @@ reader::impl::impl(std::unique_ptr<datasource> source,

// Enable or disable the conversion to numpy-compatible dtypes
_use_np_dtypes = options.is_enabled_use_np_dtypes();

// Control decimals conversion (float64 or int64 with optional scale)
_decimals_as_float64 = options.is_enabled_decimals_as_float64();
_decimals_as_int_scale = options.get_forced_decimals_scale();
}

table_with_metadata reader::impl::read(size_type skip_rows,
Expand All @@ -432,10 +425,17 @@ table_with_metadata reader::impl::read(size_type skip_rows,
// Get a list of column data types
std::vector<data_type> column_types;
for (const auto &col : _selected_columns) {
auto col_type = to_type_id(
_metadata->ff.types[col], _use_np_dtypes, _timestamp_type.id(), _decimals_as_float64);
auto col_type = to_type_id(_metadata->ff.types[col], _use_np_dtypes, _timestamp_type.id());
CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
column_types.emplace_back(col_type);
// Remove this once we support Decimal128 data type
CUDF_EXPECTS((col_type != type_id::DECIMAL64) or (_metadata->ff.types[col].precision <= 18),
"Decimal data has precision > 18, Decimal64 data type doesn't support it.");
// sign of the scale is changed since cuDF follows c++ libraries like CNL
// which uses negative scaling, but liborc and other libraries
// follow positive scaling.
auto scale =
(col_type == type_id::DECIMAL64) ? -static_cast<int32_t>(_metadata->ff.types[col].scale) : 0;
column_types.emplace_back(col_type, scale);

// Map each ORC column to its column
orc_col_map[col] = column_types.size() - 1;
Expand Down Expand Up @@ -517,16 +517,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
chunk.num_rows = stripe_info->numberOfRows;
chunk.encoding_kind = stripe_footer->columns[_selected_columns[j]].kind;
chunk.type_kind = _metadata->ff.types[_selected_columns[j]].kind;
if (_decimals_as_float64) {
chunk.decimal_scale =
_metadata->ff.types[_selected_columns[j]].scale | orc::gpu::orc_decimal2float64_scale;
} else if (_decimals_as_int_scale < 0) {
chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
} else {
chunk.decimal_scale = _decimals_as_int_scale;
}
chunk.rowgroup_id = num_rowgroups;
chunk.dtype_len = (column_types[j].id() == type_id::STRING)
chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
chunk.rowgroup_id = num_rowgroups;
chunk.dtype_len = (column_types[j].id() == type_id::STRING)
? sizeof(std::pair<const char *, size_t>)
: cudf::size_of(column_types[j]);
if (chunk.type_kind == orc::TIMESTAMP) {
Expand Down
8 changes: 3 additions & 5 deletions cpp/src/io/orc/reader_impl.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -130,11 +130,9 @@ class reader::impl {
std::unique_ptr<cudf::io::orc::metadata> _metadata;

std::vector<int> _selected_columns;
bool _use_index = true;
bool _use_np_dtypes = true;
bool _has_timestamp_column = false;
bool _decimals_as_float64 = true;
size_type _decimals_as_int_scale = -1;
bool _use_index = true;
bool _use_np_dtypes = true;
bool _has_timestamp_column = false;
data_type _timestamp_type{type_id::EMPTY};
};

Expand Down
70 changes: 28 additions & 42 deletions cpp/src/io/orc/stripe_data.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2020, NVIDIA CORPORATION.
* Copyright (c) 2019-2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -959,15 +959,6 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs,
return rle->num_vals;
}

/**
* @brief Powers of 10
*/
static const __device__ __constant__ double kPow10[40] = {
1.0, 1.e1, 1.e2, 1.e3, 1.e4, 1.e5, 1.e6, 1.e7, 1.e8, 1.e9, 1.e10, 1.e11, 1.e12, 1.e13,
1.e14, 1.e15, 1.e16, 1.e17, 1.e18, 1.e19, 1.e20, 1.e21, 1.e22, 1.e23, 1.e24, 1.e25, 1.e26, 1.e27,
1.e28, 1.e29, 1.e30, 1.e31, 1.e32, 1.e33, 1.e34, 1.e35, 1.e36, 1.e37, 1.e38, 1.e39,
};

static const __device__ __constant__ int64_t kPow5i[28] = {1,
5,
25,
Expand Down Expand Up @@ -1002,6 +993,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1,
*
* @param[in] bs Input byte stream
* @param[in,out] vals on input: scale from secondary stream, on output: value
* @param[in] val_scale Scale of each value
* @param[in] col_scale Scale from schema to which value will be adjusted
* @param[in] numvals Number of values to decode
* @param[in] t thread id
*
Expand Down Expand Up @@ -1038,39 +1031,32 @@ static __device__ int Decode_Decimals(orc_bytestream_s *bs,
if (t >= num_vals_read and t < num_vals_to_read) {
auto const pos = static_cast<int>(vals.i64[t]);
int128_s v = decode_varint128(bs, pos);

if (col_scale & orc_decimal2float64_scale) {
double f = Int128ToDouble_rn(v.lo, v.hi);
int32_t scale = (t < numvals) ? val_scale : 0;
if (scale >= 0)
vals.f64[t] = f / kPow10[min(scale, 39)];
else
vals.f64[t] = f * kPow10[min(-scale, 39)];
} else {
int32_t scale = (t < numvals) ? (col_scale & ~orc_decimal2float64_scale) - val_scale : 0;
if (scale >= 0) {
scale = min(scale, 27);
vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
} else // if (scale < 0)
{
bool is_negative = (v.hi < 0);
uint64_t hi = v.hi, lo = v.lo;
scale = min(-scale, 27);
if (is_negative) {
hi = (~hi) + (lo == 0);
lo = (~lo) + 1;
}
lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
hi >>= (int32_t)scale;
if (hi != 0) {
// Use intermediate float
lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
hi = 0;
} else {
lo /= kPow5i[scale];
}
vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
// Since cuDF column stores just one scale, value needs to
// be adjusted to col_scale from val_scale. So the difference
// of them will be used to add 0s or remove digits.
int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
if (scale >= 0) {
scale = min(scale, 27);
vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
} else // if (scale < 0)
{
bool is_negative = (v.hi < 0);
uint64_t hi = v.hi, lo = v.lo;
scale = min(-scale, 27);
if (is_negative) {
hi = (~hi) + (lo == 0);
lo = (~lo) + 1;
}
lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
hi >>= (int32_t)scale;
if (hi != 0) {
// Use intermediate float
lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
hi = 0;
} else {
lo /= kPow5i[scale];
}
vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
}
}
// There is nothing to read, so break
Expand Down
6 changes: 0 additions & 6 deletions python/cudf/cudf/_lib/cpp/io/orc.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -34,8 +34,6 @@ cdef extern from "cudf/io/orc.hpp" \
void enable_use_index(bool val) except+
void enable_use_np_dtypes(bool val) except+
void set_timestamp_type(data_type type) except+
void enable_decimals_as_float64(bool val) except+
void set_forced_decimals_scale(size_type scale) except+

@staticmethod
orc_reader_options_builder builder(
Expand All @@ -53,10 +51,6 @@ cdef extern from "cudf/io/orc.hpp" \
orc_reader_options_builder& use_index(bool val) except+
orc_reader_options_builder& use_np_dtypes(bool val) except+
orc_reader_options_builder& timestamp_type(data_type type) except+
orc_reader_options_builder& decimals_as_float64(bool val) except+
orc_reader_options_builder& forced_decimals_scale(
size_type scale
) except+

orc_reader_options build() except+

Expand Down
10 changes: 1 addition & 9 deletions python/cudf/cudf/_lib/orc.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -71,8 +71,6 @@ cpdef read_orc(object filepath_or_buffer,
object skip_rows=None,
object num_rows=None,
bool use_index=True,
bool decimals_as_float=True,
object force_decimal_scale=None,
object timestamp_type=None):
"""
Cython function to call into libcudf API, see `read_orc`.
Expand All @@ -96,9 +94,7 @@ cpdef read_orc(object filepath_or_buffer,
)
)
),
use_index,
decimals_as_float,
get_size_t_arg(force_decimal_scale, "force_decimal_scale")
use_index
)

cdef table_with_metadata c_result
Expand Down Expand Up @@ -173,8 +169,6 @@ cdef orc_reader_options make_orc_reader_options(
size_type num_rows,
type_id timestamp_type,
bool use_index,
bool decimals_as_float,
size_type force_decimal_scale
) except*:

cdef vector[string] c_column_names
Expand All @@ -192,8 +186,6 @@ cdef orc_reader_options make_orc_reader_options(
.num_rows(num_rows)
.timestamp_type(data_type(timestamp_type))
.use_index(use_index)
.decimals_as_float64(decimals_as_float)
.forced_decimals_scale(force_decimal_scale)
.build()
)

Expand Down
4 changes: 0 additions & 4 deletions python/cudf/cudf/io/orc.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,8 +223,6 @@ def read_orc(
skiprows=None,
num_rows=None,
use_index=True,
decimals_as_float=True,
force_decimal_scale=None,
timestamp_type=None,
**kwargs,
):
Expand Down Expand Up @@ -266,8 +264,6 @@ def read_orc(
skiprows,
num_rows,
use_index,
decimals_as_float,
force_decimal_scale,
timestamp_type,
)
)
Expand Down
Binary file not shown.
Loading

0 comments on commit d501d2c

Please sign in to comment.