Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Host compression #17656

Open
wants to merge 41 commits into
base: branch-25.02
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
41 commits
Select commit Hold shift + click to select a range
81dcfa6
random clean up
vuule Dec 18, 2024
4f7794d
jesus
vuule Dec 18, 2024
3166acb
Merge branch 'branch-25.02' into comp-headers-cleanup
vuule Dec 18, 2024
b3f03e8
style
vuule Dec 18, 2024
53205c5
style
vuule Dec 18, 2024
05d07ba
Merge branch 'branch-25.02' into comp-headers-cleanup
vuule Dec 18, 2024
7d23502
Merge branch 'branch-25.02' into comp-headers-cleanup
vuule Dec 18, 2024
324d635
Update cpp/src/io/comp/common.hpp
vuule Dec 18, 2024
350db40
Merge branch 'branch-25.02' into comp-headers-cleanup
vuule Dec 19, 2024
0cf8375
Merge branch 'branch-25.02' into comp-headers-cleanup
vuule Dec 19, 2024
947fbd4
fix
vuule Dec 20, 2024
0a64f1c
Merge branch 'comp-headers-cleanup' of https://github.com/vuule/cudf …
vuule Dec 20, 2024
54d9bb9
fix some more
vuule Dec 20, 2024
2ca535b
Merge branch 'branch-25.02' of https://github.com/rapidsai/cudf into …
vuule Dec 20, 2024
963f066
avoid part of nvcomp enabled checks in writers
vuule Dec 20, 2024
e119ad8
single-threaded host comp
vuule Dec 20, 2024
3ab8c41
now with more threads
vuule Dec 21, 2024
a14b351
decouple orc writer from nvcomp
vuule Dec 21, 2024
a8e1dec
Merge branch 'branch-25.02' of https://github.com/rapidsai/cudf into …
vuule Dec 23, 2024
0b6b72d
decouple pq writer from nvcomp
vuule Dec 23, 2024
d83abac
missed DEFLATE
vuule Dec 23, 2024
e2dce81
simplify
vuule Dec 23, 2024
9b8fc71
clean up
vuule Dec 23, 2024
7aaf5ed
fix
vuule Dec 23, 2024
9a5ca7d
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 2, 2025
3d311ae
style
vuule Jan 2, 2025
ba5321a
Merge branch 'high-lvl-comp-api' of https://github.com/vuule/cudf int…
vuule Jan 2, 2025
e010f9f
style some more
vuule Jan 2, 2025
19cd311
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 3, 2025
3094173
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 3, 2025
b83c1ff
Merge branch 'branch-25.02' of https://github.com/rapidsai/cudf into …
vuule Jan 6, 2025
8ceecff
handle AUTO compression in options
vuule Jan 6, 2025
7fa6055
Merge branch 'high-lvl-comp-api' of https://github.com/vuule/cudf int…
vuule Jan 6, 2025
b2cdcf4
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 6, 2025
b5b06aa
style
vuule Jan 7, 2025
bfae53a
Merge branch 'high-lvl-comp-api' of https://github.com/vuule/cudf int…
vuule Jan 7, 2025
11ca033
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 7, 2025
ea04f43
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 7, 2025
3e403b0
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 8, 2025
ae1b980
remove unused function
vuule Jan 8, 2025
a725970
Merge branch 'branch-25.02' into high-lvl-comp-api
vuule Jan 8, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 15 additions & 7 deletions cpp/include/cudf/io/orc.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
* Copyright (c) 2020-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -578,7 +578,7 @@ class orc_writer_options {
// Specify the sink to use for writer output
sink_info _sink;
// Specify the compression format to use
compression_type _compression = compression_type::AUTO;
compression_type _compression = compression_type::SNAPPY;
// Specify frequency of statistics collection
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
// Maximum size of each stripe (unless smaller than a single row group)
Expand Down Expand Up @@ -733,7 +733,11 @@ class orc_writer_options {
*
* @param comp Compression type
*/
void set_compression(compression_type comp) { _compression = comp; }
void set_compression(compression_type comp)
{
  // AUTO is not a concrete codec; the ORC writer resolves it to SNAPPY here so
  // downstream code never sees AUTO.
  _compression = (comp == compression_type::AUTO) ? compression_type::SNAPPY : comp;
}

/**
* @brief Choose granularity of statistics collection.
Expand Down Expand Up @@ -865,7 +869,7 @@ class orc_writer_options_builder {
*/
orc_writer_options_builder& compression(compression_type comp)
{
  // Delegate to set_compression() so the AUTO -> SNAPPY normalization is
  // applied here as well, instead of assigning the raw value.
  options.set_compression(comp);
  return *this;
}

Expand Down Expand Up @@ -1026,7 +1030,7 @@ class chunked_orc_writer_options {
// Specify the sink to use for writer output
sink_info _sink;
// Specify the compression format to use
compression_type _compression = compression_type::AUTO;
compression_type _compression = compression_type::SNAPPY;
// Specify granularity of statistics collection
statistics_freq _stats_freq = ORC_STATISTICS_ROW_GROUP;
// Maximum size of each stripe (unless smaller than a single row group)
Expand Down Expand Up @@ -1157,7 +1161,11 @@ class chunked_orc_writer_options {
*
* @param comp The compression type to use
*/
void set_compression(compression_type comp) { _compression = comp; }
void set_compression(compression_type comp)
{
  // Resolve the AUTO pseudo-codec to the writer default (SNAPPY) immediately,
  // so _compression always holds a concrete compression type.
  _compression = (comp == compression_type::AUTO) ? compression_type::SNAPPY : comp;
}

/**
* @brief Choose granularity of statistics collection
Expand Down Expand Up @@ -1279,7 +1287,7 @@ class chunked_orc_writer_options_builder {
*/
chunked_orc_writer_options_builder& compression(compression_type comp)
{
  // Route through set_compression() so AUTO is normalized to SNAPPY, keeping
  // the builder consistent with direct option setting.
  options.set_compression(comp);
  return *this;
}

Expand Down
157 changes: 156 additions & 1 deletion cpp/src/io/comp/comp.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
* Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -16,22 +16,43 @@

#include "comp.hpp"

#include "gpuinflate.hpp"
#include "io/utilities/getenv_or.hpp"
#include "io/utilities/hostdevice_vector.hpp"
#include "nvcomp_adapter.hpp"

#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/utilities/cuda_memcpy.hpp>
#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/memory_resource.hpp>
#include <cudf/utilities/span.hpp>

#include <BS_thread_pool.hpp>
#include <zlib.h> // GZIP compression

namespace cudf::io::detail {

namespace {

// Lazily-constructed thread pool shared by all host-side compression tasks
// (Meyers singleton; initialization is thread-safe per the C++11 rules).
// NOTE(review): std::thread::hardware_concurrency() may return 0 on some
// platforms — confirm BS::thread_pool treats 0 as "use a default thread count".
auto& h_comp_pool()
{
  static BS::thread_pool pool(std::thread::hardware_concurrency());
  return pool;
}

// Maps a cudf compression type to its nvcomp equivalent; returns nullopt for
// codecs nvcomp does not cover (note cudf ZLIB maps to nvcomp DEFLATE).
std::optional<nvcomp::compression_type> to_nvcomp_compression(compression_type compression)
{
  if (compression == compression_type::SNAPPY) { return nvcomp::compression_type::SNAPPY; }
  if (compression == compression_type::ZSTD) { return nvcomp::compression_type::ZSTD; }
  if (compression == compression_type::LZ4) { return nvcomp::compression_type::LZ4; }
  if (compression == compression_type::ZLIB) { return nvcomp::compression_type::DEFLATE; }
  return std::nullopt;
}

/**
* @brief GZIP host compressor (includes header)
*/
Expand Down Expand Up @@ -98,8 +119,128 @@ std::vector<std::uint8_t> compress_snappy(host_span<uint8_t const> src,
return cudf::detail::make_std_vector_sync<uint8_t>(d_dst, stream);
}

// Compresses a batch of device buffers on the GPU. Prefers nvcomp when it is
// available for the codec; otherwise falls back to the in-house SNAPPY kernel,
// and fails for any other codec.
void device_compress(compression_type compression,
                     device_span<device_span<uint8_t const> const> inputs,
                     device_span<device_span<uint8_t> const> outputs,
                     device_span<compression_result> results,
                     rmm::cuda_stream_view stream)
{
  // NONE is a no-op; results are left untouched.
  if (compression == compression_type::NONE) { return; }

  auto const nvcomp_type = to_nvcomp_compression(compression);
  // Holds the reason nvcomp cannot be used (empty optional means "usable");
  // codecs with no nvcomp mapping get a fixed reason string.
  auto nvcomp_disabled = nvcomp_type.has_value() ? nvcomp::is_compression_disabled(*nvcomp_type)
                                                 : "invalid compression type";
  if (not nvcomp_disabled) {
    return nvcomp::batched_compress(*nvcomp_type, inputs, outputs, results, stream);
  }

  switch (compression) {
    // In-house device SNAPPY implementation used when nvcomp is disabled.
    case compression_type::SNAPPY: return gpu_snap(inputs, outputs, results, stream);
    // No fallback exists for other codecs; surface why nvcomp was rejected.
    default: CUDF_FAIL("Compression error: " + nvcomp_disabled.value());
  }
}

// Compresses a batch of device buffers on the CPU: each block is copied to
// host, compressed via the single-buffer host compress() overload on a pool
// thread, and copied back to its device output buffer.
void host_compress(compression_type compression,
                   device_span<device_span<uint8_t const> const> inputs,
                   device_span<device_span<uint8_t> const> outputs,
                   device_span<compression_result> results,
                   rmm::cuda_stream_view stream)
{
  // NONE is a no-op; results are left untouched.
  if (compression == compression_type::NONE) { return; }

  auto const num_blocks = inputs.size();
  auto h_results        = cudf::detail::make_host_vector<compression_result>(num_blocks, stream);
  // Copy the span descriptors (not the data) to host so tasks can index them.
  auto const h_inputs  = cudf::detail::make_host_vector_async(inputs, stream);
  auto const h_outputs = cudf::detail::make_host_vector_async(outputs, stream);
  // Ensure the async span copies above have completed before tasks read them.
  stream.synchronize();

  std::vector<std::future<size_t>> tasks;
  // One forked stream per pool thread; tasks are assigned round-robin so
  // copies on different threads don't serialize on a single stream.
  auto streams = cudf::detail::fork_streams(stream, h_comp_pool().get_thread_count());
  for (size_t i = 0; i < num_blocks; ++i) {
    auto cur_stream = streams[i % streams.size()];
    auto task = [d_in = h_inputs[i], d_out = h_outputs[i], cur_stream, compression]() -> size_t {
      // device -> host copy, host compression, then host -> device copy of the
      // compressed bytes; returns the compressed size.
      auto const h_in  = cudf::detail::make_host_vector_sync(d_in, cur_stream);
      auto const h_out = compress(compression, h_in, cur_stream);
      cudf::detail::cuda_memcpy<uint8_t>(d_out.subspan(0, h_out.size()), h_out, cur_stream);
      return h_out.size();
    };
    tasks.emplace_back(h_comp_pool().submit_task(std::move(task)));
  }

  // Gather per-block compressed sizes; get() rethrows any task exception, so a
  // SUCCESS status is only recorded for blocks that completed.
  for (auto i = 0ul; i < num_blocks; ++i) {
    h_results[i] = {tasks[i].get(), compression_status::SUCCESS};
  }
  cudf::detail::cuda_memcpy_async<compression_result>(results, h_results, stream);
}

// Reports whether a host (CPU) implementation exists for the codec. Only GZIP
// has a host compressor here; NONE is trivially supported as a no-op.
[[nodiscard]] bool host_compression_supported(compression_type compression)
{
  return compression == compression_type::GZIP or compression == compression_type::NONE;
}

// Reports whether a device (GPU) implementation exists for the codec.
[[nodiscard]] bool device_compression_supported(compression_type compression)
{
  auto const nvcomp_type = to_nvcomp_compression(compression);
  switch (compression) {
    // nvcomp-only codecs: usable unless nvcomp is disabled for this type.
    case compression_type::LZ4:
    case compression_type::ZLIB:
    case compression_type::ZSTD: return not nvcomp::is_compression_disabled(nvcomp_type.value());
    // SNAPPY always works (in-house kernel fallback); NONE is a no-op.
    case compression_type::SNAPPY:
    case compression_type::NONE: return true;
    default: return false;
  }
}

// Decides whether the batch should be compressed on the host or the device.
//
// @param compression Type of compression of the input data
// @param inputs Device memory buffers to compress (currently unused; reserved
//        for future size-based heuristics)
// @param outputs Device memory buffers for the compressed output (unused, see above)
// @return true when the host path should be used, false for the device path
// @throws cudf::logic_error when neither engine supports the codec
[[nodiscard]] bool use_host_compression(
  compression_type compression,
  [[maybe_unused]] device_span<device_span<uint8_t const> const> inputs,
  [[maybe_unused]] device_span<device_span<uint8_t> const> outputs)
{
  auto const host_supported   = host_compression_supported(compression);
  auto const device_supported = device_compression_supported(compression);
  // At least one engine must be able to handle this codec. The previous check
  // (`not host_supported or device_supported`) wrongly rejected host-only
  // codecs such as GZIP, contradicting the host fallback below.
  CUDF_EXPECTS(host_supported or device_supported, "Unsupported compression type");
  if (not host_supported) { return false; }
  if (not device_supported) { return true; }
  // If both host and device compression are supported, use the host if the env var is set
  return getenv_or("LIBCUDF_USE_HOST_COMPRESSION", 0);
}

} // namespace

// Maximum uncompressed block size the codec can accept. The cap only exists
// when nvcomp performs the compression; otherwise there is no limit to report.
std::optional<size_t> compress_max_allowed_block_size(compression_type compression)
{
  auto const nvcomp_type = to_nvcomp_compression(compression);
  if (not nvcomp_type.has_value() or nvcomp::is_compression_disabled(*nvcomp_type)) {
    return std::nullopt;
  }
  return nvcomp::compress_max_allowed_chunk_size(*nvcomp_type);
}

// Input/output buffer alignment required by the codec. Only the nvcomp path
// imposes an alignment; NONE and non-nvcomp codecs need none (alignment of 1).
[[nodiscard]] size_t compress_required_block_alignment(compression_type compression)
{
  auto const nvcomp_type = to_nvcomp_compression(compression);
  auto const uses_nvcomp = compression != compression_type::NONE and nvcomp_type.has_value() and
                           not nvcomp::is_compression_disabled(*nvcomp_type);
  return uses_nvcomp ? nvcomp::required_alignment(*nvcomp_type) : 1ul;
}

// Worst-case compressed size for a chunk of the given uncompressed size.
// NONE passes data through unchanged; all other supported codecs defer to
// nvcomp's bound. Throws for codecs with no nvcomp mapping.
[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size)
{
  if (compression == compression_type::NONE) { return uncompressed_size; }

  auto const nvcomp_type = to_nvcomp_compression(compression);
  CUDF_EXPECTS(nvcomp_type.has_value(), "Unsupported compression type");
  return nvcomp::compress_max_output_chunk_size(*nvcomp_type, uncompressed_size);
}

std::vector<std::uint8_t> compress(compression_type compression,
host_span<uint8_t const> src,
rmm::cuda_stream_view stream)
Expand All @@ -112,4 +253,18 @@ std::vector<std::uint8_t> compress(compression_type compression,
}
}

// Batched compression entry point: routes the batch to the host or device
// implementation based on codec support and the LIBCUDF_USE_HOST_COMPRESSION
// environment variable (see use_host_compression).
void compress(compression_type compression,
              device_span<device_span<uint8_t const> const> inputs,
              device_span<device_span<uint8_t> const> outputs,
              device_span<compression_result> results,
              rmm::cuda_stream_view stream)
{
  CUDF_FUNC_RANGE();
  auto const run_on_host = use_host_compression(compression, inputs, outputs);
  if (run_on_host) {
    host_compress(compression, inputs, outputs, results, stream);
  } else {
    device_compress(compression, inputs, outputs, results, stream);
  }
}

} // namespace cudf::io::detail
54 changes: 53 additions & 1 deletion cpp/src/io/comp/comp.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2024, NVIDIA CORPORATION.
* Copyright (c) 2024-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -57,5 +57,57 @@ std::vector<uint8_t> compress(compression_type compression,
host_span<uint8_t const> src,
rmm::cuda_stream_view stream);

/**
* @brief Maximum size of uncompressed blocks that can be compressed.
*
* @param compression Compression type
* @returns maximum block size
*/
[[nodiscard]] std::optional<size_t> compress_max_allowed_block_size(compression_type compression);

/**
* @brief Gets input and output alignment requirements for the given compression type.
*
* @param compression Compression type
* @returns required alignment
*/
[[nodiscard]] size_t compress_required_block_alignment(compression_type compression);

/**
* @brief Gets the maximum size any chunk could compress to in the batch.
*
* @param compression Compression type
* @param uncompressed_size Size of the largest uncompressed chunk in the batch
*/
[[nodiscard]] size_t max_compressed_size(compression_type compression, uint32_t uncompressed_size);

/**
* @brief Compresses device memory buffers.
*
* @param compression Type of compression of the input data
* @param inputs Device memory buffers to compress
* @param outputs Device memory buffers to store the compressed output
* @param results Compression results
* @param stream CUDA stream used for device memory operations and kernel launches
*/
void compress(compression_type compression,
device_span<device_span<uint8_t const> const> inputs,
device_span<device_span<uint8_t> const> outputs,
device_span<compression_result> results,
rmm::cuda_stream_view stream);

/**
* @brief Aggregate results of compression into a single statistics object.
*
* @param inputs List of uncompressed input buffers
* @param results List of compression results
* @param stream CUDA stream to use
* @return writer_compression_statistics
*/
[[nodiscard]] writer_compression_statistics collect_compression_statistics(
device_span<device_span<uint8_t const> const> inputs,
device_span<compression_result const> results,
rmm::cuda_stream_view stream);

} // namespace io::detail
} // namespace CUDF_EXPORT cudf
15 changes: 1 addition & 14 deletions cpp/src/io/comp/gpuinflate.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2018-2024, NVIDIA CORPORATION.
* Copyright (c) 2018-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -124,17 +124,4 @@ void gpu_snap(device_span<device_span<uint8_t const> const> inputs,
device_span<compression_result> results,
rmm::cuda_stream_view stream);

/**
* @brief Aggregate results of compression into a single statistics object.
*
* @param inputs List of uncompressed input buffers
* @param results List of compression results
* @param stream CUDA stream to use
* @return writer_compression_statistics
*/
[[nodiscard]] writer_compression_statistics collect_compression_statistics(
device_span<device_span<uint8_t const> const> inputs,
device_span<compression_result const> results,
rmm::cuda_stream_view stream);

} // namespace cudf::io::detail
4 changes: 2 additions & 2 deletions cpp/src/io/comp/statistics.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
* Copyright (c) 2023-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -14,7 +14,7 @@
* limitations under the License.
*/

#include "gpuinflate.hpp"
#include "comp.hpp"

#include <rmm/exec_policy.hpp>

Expand Down
3 changes: 2 additions & 1 deletion cpp/src/io/functions.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
* Copyright (c) 2019-2025, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -766,6 +766,7 @@ void parquet_writer_options_base::set_stats_level(statistics_freq sf) { _stats_l
void parquet_writer_options_base::set_compression(compression_type compression)
{
  // Resolve AUTO to the Parquet default codec (SNAPPY) at option-set time so
  // the writer never has to interpret AUTO itself.
  _compression =
    (compression == compression_type::AUTO) ? compression_type::SNAPPY : compression;
}

void parquet_writer_options_base::enable_int96_timestamps(bool req)
Expand Down
Loading
Loading