Adding support for Decimal/Fixed-point to ORC reader (#7970)

Added support for decimal/fixed-point columns in the ORC reader, along with test cases. All decimal columns are read as Decimal64 columns; if a column's precision is greater than 18, the read fails loudly. This PR also removes a couple of reader options (`decimals_as_float64` and `forced_decimals_scale`) that are of no use after the addition of decimal support. #7126 Authors: - Ram (Ramakrishna Prabhu) (https://github.com/rgsl888prabhu) Approvers: - Devavret Makkar (https://github.com/devavret) - Vukasin Milovanovic (https://github.com/vuule) - GALI PREM SAGAR (https://github.com/galipremsagar) URL: #7970
rapidsai · Apr 20, 2021 · d501d2c · d501d2c
1 parent 5c2f744
commit d501d2c
Show file tree

Hide file tree

Showing 10 changed files with 81 additions and 186 deletions.
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
@@ -63,12 +63,6 @@ class orc_reader_options {
   // Cast timestamp columns to a specific type
   data_type _timestamp_type{type_id::EMPTY};
 
-  // Whether to convert decimals to float64
-  bool _decimals_as_float64 = true;
-  // For decimals as int, optional forced decimal scale;
-  // -1 is auto (column scale), >=0: number of fractional digits
-  size_type _forced_decimals_scale = -1;
-
   friend orc_reader_options_builder;
 
   /**
@@ -134,16 +128,6 @@ class orc_reader_options {
    */
   data_type get_timestamp_type() const { return _timestamp_type; }
 
-  /**
-   * @brief Whether to convert decimals to float64.
-   */
-  bool is_enabled_decimals_as_float64() const { return _decimals_as_float64; }
-
-  /**
-   * @brief Returns whether decimal scale is inferred or forced to have limited fractional digits.
-   */
-  size_type get_forced_decimals_scale() const { return _forced_decimals_scale; }
-
   // Setters
 
   /**
@@ -207,20 +191,6 @@ class orc_reader_options {
    * @param type Type of timestamp.
    */
   void set_timestamp_type(data_type type) { _timestamp_type = type; }
-
-  /**
-   * @brief Enable/Disable conversion of decimals to float64.
-   *
-   * @param val Boolean value to enable/disable.
-   */
-  void set_decimals_as_float64(bool val) { _decimals_as_float64 = val; }
-
-  /**
-   * @brief Sets whether decimal scale is inferred or forced to have limited fractional digits.
-   *
-   * @param val Length of fractional digits.
-   */
-  void set_forced_decimals_scale(size_type val) { _forced_decimals_scale = val; }
 };
 
 class orc_reader_options_builder {
@@ -325,30 +295,6 @@ class orc_reader_options_builder {
     return *this;
   }
 
-  /**
-   * @brief Enable/Disable conversion of decimals to float64.
-   *
-   * @param val Boolean value to enable/disable.
-   * @return this for chaining.
-   */
-  orc_reader_options_builder& decimals_as_float64(bool val)
-  {
-    options._decimals_as_float64 = val;
-    return *this;
-  }
-
-  /**
-   * @brief Sets whether decimal scale is inferred or forced to have limited fractional digits.
-   *
-   * @param val Length of fractional digits.
-   * @return this for chaining.
-   */
-  orc_reader_options_builder& forced_decimals_scale(size_type val)
-  {
-    options._forced_decimals_scale = val;
-    return *this;
-  }
-
   /**
    * @brief move orc_reader_options member once it's built.
    */

diff --git a/cpp/src/io/orc/orc_gpu.h b/cpp/src/io/orc/orc_gpu.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -80,11 +80,6 @@ struct DictionaryEntry {
   uint32_t len;  // Length in data stream
 };
 
-/**
- * @brief Mask to indicate conversion from decimals to float64
- */
-constexpr int orc_decimal2float64_scale = 0x80;
-
 /**
  * @brief Struct to describe per stripe's column information
  */

diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -50,8 +50,7 @@ namespace {
  */
 constexpr type_id to_type_id(const orc::SchemaType &schema,
                              bool use_np_dtypes,
-                             type_id timestamp_type_id,
-                             bool decimals_as_float64)
+                             type_id timestamp_type_id)
 {
   switch (schema.kind) {
     case orc::BOOLEAN: return type_id::BOOL8;
@@ -73,9 +72,7 @@ constexpr type_id to_type_id(const orc::SchemaType &schema,
     case orc::DATE:
       // There isn't a (DAYS -> np.dtype) mapping
       return (use_np_dtypes) ? type_id::TIMESTAMP_MILLISECONDS : type_id::TIMESTAMP_DAYS;
-    case orc::DECIMAL:
-      // There isn't an arbitrary-precision type in cuDF, so map as float or int
-      return (decimals_as_float64) ? type_id::FLOAT64 : type_id::INT64;
+    case orc::DECIMAL: return type_id::DECIMAL64;
     default: break;
   }
 
@@ -406,10 +403,6 @@ reader::impl::impl(std::unique_ptr<datasource> source,
 
   // Enable or disable the conversion to numpy-compatible dtypes
   _use_np_dtypes = options.is_enabled_use_np_dtypes();
-
-  // Control decimals conversion (float64 or int64 with optional scale)
-  _decimals_as_float64   = options.is_enabled_decimals_as_float64();
-  _decimals_as_int_scale = options.get_forced_decimals_scale();
 }
 
 table_with_metadata reader::impl::read(size_type skip_rows,
@@ -432,10 +425,17 @@ table_with_metadata reader::impl::read(size_type skip_rows,
   // Get a list of column data types
   std::vector<data_type> column_types;
   for (const auto &col : _selected_columns) {
-    auto col_type = to_type_id(
-      _metadata->ff.types[col], _use_np_dtypes, _timestamp_type.id(), _decimals_as_float64);
+    auto col_type = to_type_id(_metadata->ff.types[col], _use_np_dtypes, _timestamp_type.id());
     CUDF_EXPECTS(col_type != type_id::EMPTY, "Unknown type");
-    column_types.emplace_back(col_type);
+    // Remove this once we support Decimal128 data type
+    CUDF_EXPECTS((col_type != type_id::DECIMAL64) or (_metadata->ff.types[col].precision <= 18),
+                 "Decimal data has precision > 18, Decimal64 data type doesn't support it.");
+    // sign of the scale is changed since cuDF follows c++ libraries like CNL
+    // which uses negative scaling, but liborc and other libraries
+    // follow positive scaling.
+    auto scale =
+      (col_type == type_id::DECIMAL64) ? -static_cast<int32_t>(_metadata->ff.types[col].scale) : 0;
+    column_types.emplace_back(col_type, scale);
 
     // Map each ORC column to its column
     orc_col_map[col] = column_types.size() - 1;
@@ -517,16 +517,9 @@ table_with_metadata reader::impl::read(size_type skip_rows,
         chunk.num_rows      = stripe_info->numberOfRows;
         chunk.encoding_kind = stripe_footer->columns[_selected_columns[j]].kind;
         chunk.type_kind     = _metadata->ff.types[_selected_columns[j]].kind;
-        if (_decimals_as_float64) {
-          chunk.decimal_scale =
-            _metadata->ff.types[_selected_columns[j]].scale | orc::gpu::orc_decimal2float64_scale;
-        } else if (_decimals_as_int_scale < 0) {
-          chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
-        } else {
-          chunk.decimal_scale = _decimals_as_int_scale;
-        }
-        chunk.rowgroup_id = num_rowgroups;
-        chunk.dtype_len   = (column_types[j].id() == type_id::STRING)
+        chunk.decimal_scale = _metadata->ff.types[_selected_columns[j]].scale;
+        chunk.rowgroup_id   = num_rowgroups;
+        chunk.dtype_len     = (column_types[j].id() == type_id::STRING)
                             ? sizeof(std::pair<const char *, size_t>)
                             : cudf::size_of(column_types[j]);
         if (chunk.type_kind == orc::TIMESTAMP) {

diff --git a/cpp/src/io/orc/reader_impl.hpp b/cpp/src/io/orc/reader_impl.hpp
@@ -130,11 +130,9 @@ class reader::impl {
   std::unique_ptr<cudf::io::orc::metadata> _metadata;
 
   std::vector<int> _selected_columns;
-  bool _use_index                  = true;
-  bool _use_np_dtypes              = true;
-  bool _has_timestamp_column       = false;
-  bool _decimals_as_float64        = true;
-  size_type _decimals_as_int_scale = -1;
+  bool _use_index            = true;
+  bool _use_np_dtypes        = true;
+  bool _has_timestamp_column = false;
   data_type _timestamp_type{type_id::EMPTY};
 };
 

diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2020, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -959,15 +959,6 @@ static __device__ uint32_t Byte_RLE(orc_bytestream_s *bs,
   return rle->num_vals;
 }
 
-/**
- * @brief Powers of 10
- */
-static const __device__ __constant__ double kPow10[40] = {
-  1.0,   1.e1,  1.e2,  1.e3,  1.e4,  1.e5,  1.e6,  1.e7,  1.e8,  1.e9,  1.e10, 1.e11, 1.e12, 1.e13,
-  1.e14, 1.e15, 1.e16, 1.e17, 1.e18, 1.e19, 1.e20, 1.e21, 1.e22, 1.e23, 1.e24, 1.e25, 1.e26, 1.e27,
-  1.e28, 1.e29, 1.e30, 1.e31, 1.e32, 1.e33, 1.e34, 1.e35, 1.e36, 1.e37, 1.e38, 1.e39,
-};
-
 static const __device__ __constant__ int64_t kPow5i[28] = {1,
                                                            5,
                                                            25,
@@ -1002,6 +993,8 @@ static const __device__ __constant__ int64_t kPow5i[28] = {1,
  *
  * @param[in] bs Input byte stream
  * @param[in,out] vals on input: scale from secondary stream, on output: value
+ * @param[in] val_scale Scale of each value
+ * @param[in] col_scale Scale from schema to which value will be adjusted
  * @param[in] numvals Number of values to decode
  * @param[in] t thread id
  *
@@ -1038,39 +1031,32 @@ static __device__ int Decode_Decimals(orc_bytestream_s *bs,
     if (t >= num_vals_read and t < num_vals_to_read) {
       auto const pos = static_cast<int>(vals.i64[t]);
       int128_s v     = decode_varint128(bs, pos);
-
-      if (col_scale & orc_decimal2float64_scale) {
-        double f      = Int128ToDouble_rn(v.lo, v.hi);
-        int32_t scale = (t < numvals) ? val_scale : 0;
-        if (scale >= 0)
-          vals.f64[t] = f / kPow10[min(scale, 39)];
-        else
-          vals.f64[t] = f * kPow10[min(-scale, 39)];
-      } else {
-        int32_t scale = (t < numvals) ? (col_scale & ~orc_decimal2float64_scale) - val_scale : 0;
-        if (scale >= 0) {
-          scale       = min(scale, 27);
-          vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
-        } else  // if (scale < 0)
-        {
-          bool is_negative = (v.hi < 0);
-          uint64_t hi = v.hi, lo = v.lo;
-          scale = min(-scale, 27);
-          if (is_negative) {
-            hi = (~hi) + (lo == 0);
-            lo = (~lo) + 1;
-          }
-          lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
-          hi >>= (int32_t)scale;
-          if (hi != 0) {
-            // Use intermediate float
-            lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
-            hi = 0;
-          } else {
-            lo /= kPow5i[scale];
-          }
-          vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
+      // Since cuDF column stores just one scale, value needs to
+      // be adjusted to col_scale from val_scale. So the difference
+      // of them will be used to add 0s or remove digits.
+      int32_t scale = (t < numvals) ? col_scale - val_scale : 0;
+      if (scale >= 0) {
+        scale       = min(scale, 27);
+        vals.i64[t] = ((int64_t)v.lo * kPow5i[scale]) << scale;
+      } else  // if (scale < 0)
+      {
+        bool is_negative = (v.hi < 0);
+        uint64_t hi = v.hi, lo = v.lo;
+        scale = min(-scale, 27);
+        if (is_negative) {
+          hi = (~hi) + (lo == 0);
+          lo = (~lo) + 1;
+        }
+        lo = (lo >> (uint32_t)scale) | ((uint64_t)hi << (64 - scale));
+        hi >>= (int32_t)scale;
+        if (hi != 0) {
+          // Use intermediate float
+          lo = __double2ull_rn(Int128ToDouble_rn(lo, hi) / __ll2double_rn(kPow5i[scale]));
+          hi = 0;
+        } else {
+          lo /= kPow5i[scale];
         }
+        vals.i64[t] = (is_negative) ? -(int64_t)lo : (int64_t)lo;
       }
     }
     // There is nothing to read, so break

diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -34,8 +34,6 @@ cdef extern from "cudf/io/orc.hpp" \
         void enable_use_index(bool val) except+
         void enable_use_np_dtypes(bool val) except+
         void set_timestamp_type(data_type type) except+
-        void enable_decimals_as_float64(bool val) except+
-        void set_forced_decimals_scale(size_type scale) except+
 
         @staticmethod
         orc_reader_options_builder builder(
@@ -53,10 +51,6 @@ cdef extern from "cudf/io/orc.hpp" \
         orc_reader_options_builder& use_index(bool val) except+
         orc_reader_options_builder& use_np_dtypes(bool val) except+
         orc_reader_options_builder& timestamp_type(data_type type) except+
-        orc_reader_options_builder& decimals_as_float64(bool val) except+
-        orc_reader_options_builder& forced_decimals_scale(
-            size_type scale
-        ) except+
 
         orc_reader_options build() except+
 

diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx
@@ -71,8 +71,6 @@ cpdef read_orc(object filepath_or_buffer,
                object skip_rows=None,
                object num_rows=None,
                bool use_index=True,
-               bool decimals_as_float=True,
-               object force_decimal_scale=None,
                object timestamp_type=None):
     """
     Cython function to call into libcudf API, see `read_orc`.
@@ -96,9 +94,7 @@ cpdef read_orc(object filepath_or_buffer,
                 )
             )
         ),
-        use_index,
-        decimals_as_float,
-        get_size_t_arg(force_decimal_scale, "force_decimal_scale")
+        use_index
     )
 
     cdef table_with_metadata c_result
@@ -173,8 +169,6 @@ cdef orc_reader_options make_orc_reader_options(
     size_type num_rows,
     type_id timestamp_type,
     bool use_index,
-    bool decimals_as_float,
-    size_type force_decimal_scale
 ) except*:
 
     cdef vector[string] c_column_names
@@ -192,8 +186,6 @@ cdef orc_reader_options make_orc_reader_options(
         .num_rows(num_rows)
         .timestamp_type(data_type(timestamp_type))
         .use_index(use_index)
-        .decimals_as_float64(decimals_as_float)
-        .forced_decimals_scale(force_decimal_scale)
         .build()
     )
 

diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py
@@ -223,8 +223,6 @@ def read_orc(
     skiprows=None,
     num_rows=None,
     use_index=True,
-    decimals_as_float=True,
-    force_decimal_scale=None,
     timestamp_type=None,
     **kwargs,
 ):
@@ -266,8 +264,6 @@ def read_orc(
                 skiprows,
                 num_rows,
                 use_index,
-                decimals_as_float,
-                force_decimal_scale,
                 timestamp_type,
             )
         )

diff --git a/python/cudf/cudf/tests/data/orc/TestOrcFile.int_decimal.precision_19.orc b/python/cudf/cudf/tests/data/orc/TestOrcFile.int_decimal.precision_19.orc