From c7b13fd26a7947454c94559546bcc801acd026ba Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Thu, 15 Aug 2024 23:44:12 +0000 Subject: [PATCH 1/9] Initial commit for benchmark for PQ writer dict encoding --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 46d2927a92b..6bad89206a8 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -198,6 +198,13 @@ using stats_list = nvbench::enum_type_list; +NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) + .set_name("parquet_write_dict_encode") + .set_type_axes_names({"data_type"}) + .set_min_samples(4) + .add_int64_axis("cardinality", {1,10,100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("run_length", {32}); + NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_encode") .set_type_axes_names({"data_type"}) From f255fba954b470ae584cf3966d49e817bc266104 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Sat, 17 Aug 2024 01:57:02 +0000 Subject: [PATCH 2/9] Updates --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 6bad89206a8..c03b527c509 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -48,8 +48,9 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list(DataType)); cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto const compression = cudf::io::compression_type::SNAPPY; - auto const sink_type = io_type::VOID; + auto compression = cudf::io::compression_type::NONE; + if (state.get_int64("compression")) { compression = cudf::io::compression_type::SNAPPY; } + auto const sink_type = io_type::VOID; auto const tbl = create_random_table(cycle_dtypes(data_types, num_cols), @@ -202,14 +203,16 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_dict_encode") .set_type_axes_names({"data_type"}) .set_min_samples(4) - .add_int64_axis("cardinality", {1,10,100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) - .add_int64_axis("run_length", {32}); + .add_int64_axis("cardinality", {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) + .add_int64_axis("compression", {0}) + .add_int64_axis("run_length", {0, 32, 64}); NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_encode") .set_type_axes_names({"data_type"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) + .add_int64_axis("compression", {0}) .add_int64_axis("run_length", {1, 32}); NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) From c17a4b969530fa0792e7339bebfff17db4561b97 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:25:21 -0700 Subject: [PATCH 3/9] Minor updates --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index c03b527c509..3f98e3fff07 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -212,7 +212,7 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_type_axes_names({"data_type"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("compression", {0}) + .add_int64_axis("compression", {1}) .add_int64_axis("run_length", {1, 32}); NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) From 1535fee150254ce7531400fe97df1ddf6a656f89 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 19 Aug 2024 14:25:58 -0700 Subject: [PATCH 4/9] Update run length --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 3f98e3fff07..fb5c2178f47 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -205,7 +205,7 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_min_samples(4) .add_int64_axis("cardinality", {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) .add_int64_axis("compression", {0}) - .add_int64_axis("run_length", {0, 32, 64}); + .add_int64_axis("run_length", {1, 32, 64}); NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_encode") From a29fb5f6133067f9c0d116bf4f1e3414d935b6cb Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 19 Aug 2024 21:33:38 +0000 Subject: [PATCH 5/9] Updates --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index fb5c2178f47..a80505e7f31 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -48,8 +48,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list(DataType)); cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto compression = cudf::io::compression_type::NONE; - if (state.get_int64("compression")) { compression = cudf::io::compression_type::SNAPPY; } + auto const compression = cudf::io::compression_type::SNAPPY; auto const sink_type = io_type::VOID; auto const tbl = @@ -199,21 +198,12 @@ using stats_list = nvbench::enum_type_list; -NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) - .set_name("parquet_write_dict_encode") - .set_type_axes_names({"data_type"}) - .set_min_samples(4) - .add_int64_axis("cardinality", {1, 10, 100, 1'000, 10'000, 100'000, 1'000'000, 10'000'000}) - .add_int64_axis("compression", {0}) - .add_int64_axis("run_length", {1, 32, 64}); - NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_encode") .set_type_axes_names({"data_type"}) .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000}) - .add_int64_axis("compression", {1}) - .add_int64_axis("run_length", {1, 32}); + .add_int64_axis("cardinality", {1, 1000, 10'000, 100'000, 1'000'000}) + .add_int64_axis("run_length", {1, 32, 64}); NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) .set_name("parquet_write_io_compression") From 0cc2bd6f52e970f3a88233dc2b1ad43dcf3c707a Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Mon, 19 Aug 2024 21:36:43 +0000 Subject: [PATCH 6/9] Revert erroneous white space change --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index a80505e7f31..44e002fc993 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -48,8 +48,8 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list(DataType)); cudf::size_type const cardinality = state.get_int64("cardinality"); cudf::size_type const run_length = state.get_int64("run_length"); - auto const compression = cudf::io::compression_type::SNAPPY; - auto const sink_type = io_type::VOID; + auto const compression = cudf::io::compression_type::SNAPPY; + auto const sink_type = io_type::VOID; auto const tbl = create_random_table(cycle_dtypes(data_types, num_cols), From b00850d97b9bba8f514398fc03d737e839c904b6 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb Date: Fri, 6 Sep 2024 22:55:07 +0000 Subject: [PATCH 7/9] Code review updates --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 44e002fc993..2f939cf1130 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -202,7 +202,7 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_encode") .set_type_axes_names({"data_type"}) .set_min_samples(4) - .add_int64_axis("cardinality", {1, 1000, 10'000, 100'000, 1'000'000}) + .add_int64_axis("cardinality", {0, 1000, 10'000}) .add_int64_axis("run_length", {1, 32, 64}); NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) From fef3179ffd4e1f24e7ebff9fab938c1b59db39e4 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Fri, 6 Sep 2024 18:32:16 -0700 Subject: [PATCH 8/9] Add 100K cardinality --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 2f939cf1130..6cefdbaec2b 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -202,7 +202,7 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_name("parquet_write_encode") .set_type_axes_names({"data_type"}) .set_min_samples(4) - .add_int64_axis("cardinality", {0, 1000, 10'000}) + .add_int64_axis("cardinality", {0, 1000, 10'000, 100'000}) .add_int64_axis("run_length", {1, 32, 64}); NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) From bfe3b13d6750687196c326d791e1336e26eeceb1 Mon Sep 17 00:00:00 2001 From: Muhammad Haseeb <14217455+mhaseeb123@users.noreply.github.com> Date: Mon, 9 Sep 2024 11:48:40 -0700 Subject: [PATCH 9/9] Updates from code review --- cpp/benchmarks/io/parquet/parquet_writer.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 6cefdbaec2b..256e50f0e64 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -203,7 +203,7 @@ NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) .set_type_axes_names({"data_type"}) .set_min_samples(4) .add_int64_axis("cardinality", {0, 1000, 10'000, 100'000}) - .add_int64_axis("run_length", {1, 32, 64}); + .add_int64_axis("run_length", {1, 8, 32}); NVBENCH_BENCH_TYPES(BM_parq_write_io_compression, NVBENCH_TYPE_AXES(io_list, compression_list)) .set_name("parquet_write_io_compression")