From 62c4f99f79852a95f61f241c884e598c9164331d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 30 Jun 2023 17:18:26 -0400 Subject: [PATCH] Fix memcheck error found in nvtext tokenize functions (#13649) Fixes memcheck error found by the nightly build in the nvtext `characters_tokenizer` utility function. ``` [ RUN ] TextNgramsTokenizeTest.Tokenize ========= Invalid __global__ read of size 1 bytes ========= at 0x2360 in void cub::CUB_101702_610_860_NS::DeviceScanKernel::Policy600, thrust::cuda_cub::transform_input_iterator_t, nvtext::detail::strings_tokenizer>, int *, cub::CUB_101702_610_860_NS::ScanTileState, thrust::plus, cub::CUB_101702_610_860_NS::NullType, int>(T2, T3, T4, int, T5, T6, T7) ========= by thread (5,0,0) in block (0,0,0) ========= Address 0x7f67a0200a65 is out of bounds ========= and is 1 bytes after the nearest allocation at 0x7f67a0200a00 of size 101 bytes ========= Saved host backtrace up to driver entry point at kernel launch time ========= Host Frame: [0x30b492] ========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1 ========= Host Frame: [0x1488c] ========= in /conda/envs/rapids/lib/libcudart.so.11.0 ========= Host Frame:cudaLaunchKernel [0x6c318] ========= in /conda/envs/rapids/lib/libcudart.so.11.0 ========= Host Frame:nvtext::detail::ngrams_tokenize(cudf::strings_column_view const&, int, cudf::string_scalar const&, cudf::string_scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) [0x2693cc9] ========= in /conda/envs/rapids/lib/libcudf.so ``` This error was introduced by changes in #13480 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13649 --- cpp/src/text/utilities/tokenize_ops.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh 
index 003c041c0bf..89825e31e5c 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -89,6 +89,7 @@ struct characters_tokenizer { __device__ bool next_token() { auto const src_ptr = d_str.data(); + if (current_position >= d_str.size_bytes()) { return false; } if (current_position != 0) { // skip these 2 lines the first time through current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]); start_position = current_position;