Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[REVIEW] Fix cudf::strings:split logic for many columns #4922

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
0d045e5
move split-record fn to separate source file
davidwendt Apr 14, 2020
e6f0d35
recoded split/rsplit for perf
davidwendt Apr 16, 2020
c18eb2c
fix test::print for strings columns with no nullmask
davidwendt Apr 16, 2020
7b6ead3
add more tests using maxsplits parm
davidwendt Apr 16, 2020
6dcfc91
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 16, 2020
fa3135a
update comments and remove prints
davidwendt Apr 16, 2020
cc96b68
fix all-null special case
davidwendt Apr 16, 2020
43fde57
hates compile warnings
davidwendt Apr 17, 2020
36659c1
workaround compile segfault
davidwendt Apr 17, 2020
33dea6c
update changelog
davidwendt Apr 17, 2020
e225d7c
fix changelog entry
davidwendt Apr 17, 2020
a709fa7
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 17, 2020
0cb9b11
add base class for split/rsplit fns
davidwendt Apr 17, 2020
41c0eff
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 17, 2020
9340e8a
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 17, 2020
fb114f1
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 20, 2020
9832fef
handle empty column
davidwendt Apr 20, 2020
fa59935
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 20, 2020
bb8cb7b
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 21, 2020
772237c
fix merge conflicts from clang format
davidwendt Apr 23, 2020
3f0706b
fix merge conflict from clang format again
davidwendt Apr 23, 2020
0eb2392
clang-format new file
davidwendt Apr 23, 2020
f2a8142
add tests for overlapped delimiter
davidwendt Apr 23, 2020
789a37a
update parameter name in declarations
davidwendt Apr 23, 2020
520e29c
fix comment
davidwendt Apr 23, 2020
26fb2f1
one more test string
davidwendt Apr 23, 2020
80188e0
forgot clang format
davidwendt Apr 23, 2020
a258cf3
add factory with iterator parms
davidwendt Apr 24, 2020
b1bc0dd
place tokens in column order; use new iterator factory
davidwendt Apr 24, 2020
90f98a9
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 24, 2020
5bc4ede
Merge branch 'branch-0.14' into perf-strings-split-to-many-columns
davidwendt Apr 24, 2020
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,7 @@
- PR #4899 Fix series inplace handling
- PR #4940 Fix boolean mask issue with large sized Dataframe
- PR #4889 Fix multi-index merging
- PR #4922 Fix cudf::strings:split logic for many columns
- PR #4949 Fix scatter, gather benchmark constructor call
- PR #4965 Raise Error when there are duplicate columns sent to `cudf.concat`
- PR #4984 Fix groupby nth aggregation negative n and exclude nulls
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -632,6 +632,7 @@ add_library(cudf
src/strings/sorting/sorting.cu
src/strings/split/partition.cu
src/strings/split/split.cu
src/strings/split/split_record.cu
src/strings/strings_column_factories.cu
src/strings/strings_column_view.cu
src/strings/strings_scalar_factories.cpp
Expand Down
104 changes: 104 additions & 0 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
@@ -0,0 +1,104 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/valid_if.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/utilities/error.hpp>
#include <strings/utilities.cuh>

#include <rmm/thrust_rmm_allocator.h>
#include <thrust/for_each.h>
#include <thrust/transform_reduce.h>

// clang-format off
namespace cudf {
namespace strings {
namespace detail {

// Create a strings-type column from vector of pointer/size pairs
template<typename IndexPairIterator>
std::unique_ptr<column> make_strings_column(
IndexPairIterator begin, IndexPairIterator end,
rmm::mr::device_memory_resource* mr,
cudaStream_t stream )
{
CUDF_FUNC_RANGE();
size_type strings_count = thrust::distance(begin,end);
if (strings_count == 0) return strings::detail::make_empty_strings_column(mr, stream);

using string_index_pair = thrust::pair<const char*, size_type>;

auto execpol = rmm::exec_policy(stream);
// check total size is not too large for cudf column
size_t bytes = thrust::transform_reduce(
execpol->on(stream), begin, end,
[] __device__(string_index_pair const& item) {
return (item.first != nullptr) ? item.second : 0;
},
0,
thrust::plus<size_t>());
CUDF_EXPECTS(bytes < std::numeric_limits<size_type>::max(),
"total size of strings is too large for cudf column");

// build offsets column from the strings sizes
auto offsets_transformer = [begin] __device__(size_type idx) {
string_index_pair const item = begin[idx];
return (item.first != nullptr ? static_cast<int32_t>(item.second) : 0);
};
auto offsets_transformer_itr = thrust::make_transform_iterator(
thrust::make_counting_iterator<size_type>(0), offsets_transformer);
auto offsets_column = strings::detail::make_offsets_child_column(
offsets_transformer_itr, offsets_transformer_itr + strings_count, mr, stream);
auto d_offsets = offsets_column->view().template data<int32_t>();

// create null mask
auto new_nulls = experimental::detail::valid_if( begin, end,
[] __device__(string_index_pair const item) { return item.first != nullptr; },
stream,
mr);
auto null_count = new_nulls.second;
rmm::device_buffer null_mask;
if (null_count > 0) null_mask = std::move(new_nulls.first);

// build chars column
auto chars_column =
strings::detail::create_chars_child_column(strings_count, null_count, bytes, mr, stream);
auto d_chars = chars_column->mutable_view().template data<char>();
thrust::for_each_n(execpol->on(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
[begin, d_offsets, d_chars] __device__(size_type idx) {
string_index_pair const item = begin[idx];
if (item.first != nullptr)
memcpy(d_chars + d_offsets[idx], item.first, item.second);
});

return make_strings_column(strings_count,
std::move(offsets_column),
std::move(chars_column),
null_count,
std::move(null_mask),
stream,
mr);
}

} // namespace detail
} // namespace strings
} // namespace cudf
// clang-format on TODO fix
8 changes: 4 additions & 4 deletions cpp/include/cudf/strings/split/split.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ namespace strings {
*
* Any null string entries return corresponding null output columns.
*
* @param strings Strings instance for this operation.
* @param strings_column Strings instance for this operation.
* @param delimiter UTF-8 encoded string indentifying the split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
Expand All @@ -44,7 +44,7 @@ namespace strings {
* @return New table of strings columns.
*/
std::unique_ptr<experimental::table> split(
strings_column_view const& strings,
strings_column_view const& strings_column,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
Expand All @@ -63,7 +63,7 @@ std::unique_ptr<experimental::table> split(
*
* Any null string entries return corresponding null output columns.
*
* @param strings Strings instance for this operation.
* @param strings_column Strings instance for this operation.
* @param delimiter UTF-8 encoded string indentifying the split points in each string.
* Default of empty string indicates split on whitespace.
* @param maxsplit Maximum number of splits to perform.
Expand All @@ -72,7 +72,7 @@ std::unique_ptr<experimental::table> split(
* @return New strings columns.
*/
std::unique_ptr<experimental::table> rsplit(
strings_column_view const& strings,
strings_column_view const& strings_column,
string_scalar const& delimiter = string_scalar(""),
size_type maxsplit = -1,
rmm::mr::device_memory_resource* mr = rmm::mr::get_default_resource());
Expand Down
3 changes: 1 addition & 2 deletions cpp/src/rolling/rolling.cu
Original file line number Diff line number Diff line change
Expand Up @@ -446,8 +446,7 @@ struct rolling_window_launcher {
// and that's why nullify_out_of_bounds/ignore_out_of_bounds is true.
auto output_table =
detail::gather(table_view{{input}}, output->view(), false, true, false, mr, stream);
return std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
;
output = std::make_unique<cudf::column>(std::move(output_table->get_column(0)));
}

return output;
Expand Down
Loading