From 62c4f99f79852a95f61f241c884e598c9164331d Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Fri, 30 Jun 2023 17:18:26 -0400 Subject: [PATCH] Fix memcheck error found in nvtext tokenize functions (#13649) Fixes memcheck error found by the nightly build in the nvtext `characters_tokenizer` utility function. ``` [ RUN ] TextNgramsTokenizeTest.Tokenize ========= Invalid __global__ read of size 1 bytes ========= at 0x2360 in void cub::CUB_101702_610_860_NS::DeviceScanKernel::Policy600, thrust::cuda_cub::transform_input_iterator_t, nvtext::detail::strings_tokenizer>, int *, cub::CUB_101702_610_860_NS::ScanTileState, thrust::plus, cub::CUB_101702_610_860_NS::NullType, int>(T2, T3, T4, int, T5, T6, T7) ========= by thread (5,0,0) in block (0,0,0) ========= Address 0x7f67a0200a65 is out of bounds ========= and is 1 bytes after the nearest allocation at 0x7f67a0200a00 of size 101 bytes ========= Saved host backtrace up to driver entry point at kernel launch time ========= Host Frame: [0x30b492] ========= in /usr/lib/x86_64-linux-gnu/libcuda.so.1 ========= Host Frame: [0x1488c] ========= in /conda/envs/rapids/lib/libcudart.so.11.0 ========= Host Frame:cudaLaunchKernel [0x6c318] ========= in /conda/envs/rapids/lib/libcudart.so.11.0 ========= Host Frame:nvtext::detail::ngrams_tokenize(cudf::strings_column_view const&, int, cudf::string_scalar const&, cudf::string_scalar const&, rmm::cuda_stream_view, rmm::mr::device_memory_resource*) [0x2693cc9] ========= in /conda/envs/rapids/lib/libcudf.so ``` This error was introduced by changes in #13480 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13649 --- cpp/src/text/utilities/tokenize_ops.cuh | 1 + 1 file changed, 1 insertion(+) diff --git a/cpp/src/text/utilities/tokenize_ops.cuh b/cpp/src/text/utilities/tokenize_ops.cuh 
index 003c041c0bf..89825e31e5c 100644 --- a/cpp/src/text/utilities/tokenize_ops.cuh +++ b/cpp/src/text/utilities/tokenize_ops.cuh @@ -89,6 +89,7 @@ struct characters_tokenizer { __device__ bool next_token() { auto const src_ptr = d_str.data(); + if (current_position >= d_str.size_bytes()) { return false; } if (current_position != 0) { // skip these 2 lines the first time through current_position += cudf::strings::detail::bytes_in_char_utf8(src_ptr[current_position]); start_position = current_position;