From bc7a1cc1f1a19dae9bab79f11a443c7d9bd69936 Mon Sep 17 00:00:00 2001
From: Allison Vacanti <alliepiper16@gmail.com>
Date: Tue, 29 Mar 2022 15:19:06 -0400
Subject: [PATCH] Alias deprecated TexRefInputIterator to TexObjInputIterator.

Removes usage of deprecated CUDART texture reference APIs without
breaking CUB's API.

Other changes:
- Remove obsolete checks for CUDART_VERSION >= 5.5.
- Split `test_iterator.cu` to create a test that only handles the
  deprecated `TexRefInputIterator` API.
  - Reduces the scope of our deprecation suppressions.
  - Also removed the deprecation suppression from our CMake logic.
- Enable testing for `TexObjInputIterator` without `CUB_CDP`.
- Add a meaningful implementation for `ostream << TexObjInputIterator`.
---
 cmake/CubBuildCompilerTargets.cmake     |  10 -
 cub/iterator/tex_obj_input_iterator.cuh |   3 +
 cub/iterator/tex_ref_input_iterator.cuh | 340 +-----------------------
 test/test_iterator.cu                   |  81 +-----
 test/test_iterator_deprecated.cu        | 306 +++++++++++++++++++++
 5 files changed, 316 insertions(+), 424 deletions(-)
 create mode 100644 test/test_iterator_deprecated.cu
diff --git a/cmake/CubBuildCompilerTargets.cmake b/cmake/CubBuildCompilerTargets.cmake
index 398dba0ad4..45b640ecbc 100644
--- a/cmake/CubBuildCompilerTargets.cmake
+++ b/cmake/CubBuildCompilerTargets.cmake
@@ -57,11 +57,6 @@ function(cub_build_compiler_targets)
 
     # This complains about functions in CUDA system headers when used with nvcc.
     append_option_if_available("-Wno-unused-function" cxx_compile_options)
-
-    # CUB uses deprecated texture functions (cudaBindTexture, etc). These
-    # need to be replaced, but silence the warnings for now.
-    # This can be removed once NVIDIA/cub#191 is fixed.
-    append_option_if_available("-Wno-deprecated-declarations" cxx_compile_options)
   endif()
 
   if ("GNU" STREQUAL "${CMAKE_CXX_COMPILER_ID}")
@@ -115,10 +110,5 @@ function(cub_build_compiler_targets)
     $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Xcudafe=--promote_warnings>
     # Don't complain about deprecated GPU targets.
     $<$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>:-Wno-deprecated-gpu-targets>
-    # Suppress deprecation warnings in nvcc < 11.5.
-    # TexRefInputIterator uses deprecated CUDART APIs, see NVIDIA/cub#191.
-    # After 11.5, we will suppress these in-code via pragma, but for older nvcc
-    # we have to use the big hammer:
-    $<$<AND:$<COMPILE_LANG_AND_ID:CUDA,NVIDIA>,$<VERSION_LESS:$<CUDA_COMPILER_VERSION>,11.5>>:-Wno-deprecated-declarations>
   )
 endfunction()
diff --git a/cub/iterator/tex_obj_input_iterator.cuh b/cub/iterator/tex_obj_input_iterator.cuh
index e289eefcf8..e7659d43c9 100644
--- a/cub/iterator/tex_obj_input_iterator.cuh
+++ b/cub/iterator/tex_obj_input_iterator.cuh
@@ -306,6 +306,9 @@ public:
     /// ostream operator
     friend std::ostream& operator<<(std::ostream& os, const self_type& itr)
     {
+        os << "cub::TexObjInputIterator( ptr=" << itr.ptr
+           << ", offset=" << itr.tex_offset
+           << ", tex_obj=" << itr.tex_obj << " )";
         return os;
     }
 
diff --git a/cub/iterator/tex_ref_input_iterator.cuh b/cub/iterator/tex_ref_input_iterator.cuh
index ecb9b5be4f..bcabe2c9b3 100644
--- a/cub/iterator/tex_ref_input_iterator.cuh
+++ b/cub/iterator/tex_ref_input_iterator.cuh
@@ -33,146 +33,20 @@
 
 #pragma once
 
-#include <iterator>
-#include <iostream>
+#include <cub/config.cuh>
+#include <cub/iterator/tex_obj_input_iterator.cuh>
 
-#include "../thread/thread_load.cuh"
-#include "../thread/thread_store.cuh"
-#include "../util_device.cuh"
-#include "../util_debug.cuh"
-#include "../config.cuh"
-
-#if (CUDART_VERSION >= 5050) || defined(DOXYGEN_ACTIVE)  // This iterator is compatible with CUDA 5.5 and newer
-
-#if (THRUST_VERSION >= 100700)    // This iterator is compatible with Thrust API 1.7 and newer
-    #include <thrust/iterator/iterator_facade.h>
-    #include <thrust/iterator/iterator_traits.h>
-#endif // THRUST_VERSION
+#include <cstddef>
 
 CUB_NAMESPACE_BEGIN
 
-
-/******************************************************************************
- * Static file-scope Tesla/Fermi-style texture references
- *****************************************************************************/
-
-#ifndef DOXYGEN_SHOULD_SKIP_THIS    // Do not document
-
-// Anonymous namespace
-namespace {
-
-/// Global texture reference specialized by type
-template <typename T>
-struct CUB_DEPRECATED IteratorTexRef
-{
-
-// This class uses the deprecated cudaBindTexture / cudaUnbindTexture APIs.
-// See issue NVIDIA/cub#191.
-// Turn off deprecation warnings when compiling class implementation in favor
-// of deprecating TexRefInputIterator instead.
-#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
-#pragma warning(disable:4996)
-#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC || \
-      CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 1215
-#endif
-
-    /// And by unique ID
-    template <int UNIQUE_ID>
-    struct TexId
-    {
-        // Largest texture word we can use in device
-        typedef typename UnitWord<T>::DeviceWord DeviceWord;
-        typedef typename UnitWord<T>::TextureWord TextureWord;
-
-        // Number of texture words per T
-        enum {
-            DEVICE_MULTIPLE = sizeof(T) / sizeof(DeviceWord),
-            TEXTURE_MULTIPLE = sizeof(T) / sizeof(TextureWord)
-        };
-
-        // Texture reference type
-        typedef texture<TextureWord> TexRef;
-
-        // Texture reference
-        static TexRef ref;
-
-        /// Bind texture
-        static cudaError_t BindTexture(void *d_in, size_t &bytes, size_t &offset)
-        {
-            if (d_in)
-            {
-                cudaChannelFormatDesc tex_desc = cudaCreateChannelDesc<TextureWord>();
-                ref.channelDesc = tex_desc;
-                return (CubDebug(cudaBindTexture(&offset, ref, d_in, bytes)));
-            }
-
-            return cudaSuccess;
-        }
-
-        /// Unbind texture
-        static cudaError_t UnbindTexture()
-        {
-            return CubDebug(cudaUnbindTexture(ref));
-        }
-
-        /// Fetch element
-        template <typename Distance>
-        static __device__ __forceinline__ T Fetch(Distance tex_offset)
-        {
-            DeviceWord temp[DEVICE_MULTIPLE];
-            TextureWord *words = reinterpret_cast<TextureWord*>(temp);
-
-            #pragma unroll
-            for (int i = 0; i < TEXTURE_MULTIPLE; ++i)
-            {
-                words[i] = tex1Dfetch(ref, (tex_offset * TEXTURE_MULTIPLE) + i);
-            }
-
-            return reinterpret_cast<T&>(temp);
-        }
-    };
-};
-
-// Texture reference definitions
-template <typename  T>
-template <int       UNIQUE_ID>
-typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>::template TexId<UNIQUE_ID>::ref = 0;
-
-// Re-enable deprecation warnings:
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
-#pragma nv_diagnostic pop
-#endif
-
-#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
-#pragma warning(default:4996)
-#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC || \
-      CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
-#pragma GCC diagnostic pop
-#endif
-
-} // Anonymous namespace
-
-
-#endif // DOXYGEN_SHOULD_SKIP_THIS
-
-
-
 /**
  * \addtogroup UtilIterator
  * @{
  */
 
-
-
 /**
- * \brief A random-access input wrapper for dereferencing array values through texture cache.  Uses older Tesla/Fermi-style texture references.
+ * \brief A random-access input wrapper for dereferencing array values through texture cache.
  *
  * \deprecated [Since 1.13.0] The CUDA texture management APIs used by
  * TexRefInputIterator are deprecated. Use cub::TexObjInputIterator instead.
@@ -225,212 +99,10 @@ typename IteratorTexRef<T>::template TexId<UNIQUE_ID>::TexRef IteratorTexRef<T>:
  */
 template <
     typename    T,
-    int         UNIQUE_ID,
+    int         /*UNIQUE_ID*/,
     typename    OffsetT = ptrdiff_t>
-class CUB_DEPRECATED TexRefInputIterator
-{
-
-// This class uses the deprecated cudaBindTexture / cudaUnbindTexture APIs.
-// See issue NVIDIA/cub#191.
-// Turn off deprecation warnings when compiling class implementation in favor
-// of deprecating TexRefInputIterator instead.
-#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
-#pragma warning(disable:4996)
-#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC || \
-      CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
-#pragma GCC diagnostic push
-#pragma GCC diagnostic ignored "-Wdeprecated-declarations"
-#endif
-
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
-#pragma nv_diagnostic push
-#pragma nv_diag_suppress 1215
-#endif
-
-public:
-
-    // Required iterator traits
-    typedef TexRefInputIterator                 self_type;              ///< My own type
-    typedef OffsetT                             difference_type;        ///< Type to express the result of subtracting one iterator from another
-    typedef T                                   value_type;             ///< The type of the element the iterator can point to
-    typedef T*                                  pointer;                ///< The type of a pointer to an element the iterator can point to
-    typedef T                                   reference;              ///< The type of a reference to an element the iterator can point to
-
-#if (THRUST_VERSION >= 100700)
-    // Use Thrust's iterator categories so we can use these iterators in Thrust 1.7 (or newer) methods
-    typedef typename THRUST_NS_QUALIFIER::detail::iterator_facade_category<
-        THRUST_NS_QUALIFIER::device_system_tag,
-        THRUST_NS_QUALIFIER::random_access_traversal_tag,
-        value_type,
-        reference
-      >::type iterator_category;                                        ///< The iterator category
-#else
-    typedef std::random_access_iterator_tag     iterator_category;      ///< The iterator category
-#endif  // THRUST_VERSION
-
-private:
-
-    T*              ptr;
-    difference_type tex_offset;
-
-    // Texture reference wrapper (old Tesla/Fermi-style textures)
-    typedef typename IteratorTexRef<T>::template TexId<UNIQUE_ID> TexId;
-
-public:
-/*
-    /// Constructor
-    __host__ __device__ __forceinline__ TexRefInputIterator()
-    :
-        ptr(NULL),
-        tex_offset(0)
-    {}
-*/
-    /// Use this iterator to bind \p ptr with a texture reference
-    template <typename QualifiedT>
-    cudaError_t BindTexture(
-        QualifiedT      *ptr,                   ///< Native pointer to wrap that is aligned to cudaDeviceProp::textureAlignment
-        size_t          bytes,                  ///< Number of bytes in the range
-        size_t          tex_offset = 0)         ///< OffsetT (in items) from \p ptr denoting the position of the iterator
-    {
-        this->ptr = const_cast<typename std::remove_cv<QualifiedT>::type *>(ptr);
-        size_t offset;
-        cudaError_t retval = TexId::BindTexture(this->ptr + tex_offset, bytes, offset);
-        this->tex_offset = (difference_type) (offset / sizeof(QualifiedT));
-        return retval;
-    }
-
-    /// Unbind this iterator from its texture reference
-    cudaError_t UnbindTexture()
-    {
-        return TexId::UnbindTexture();
-    }
-
-    /// Postfix increment
-    __host__ __device__ __forceinline__ self_type operator++(int)
-    {
-        self_type retval = *this;
-        tex_offset++;
-        return retval;
-    }
-
-    /// Prefix increment
-    __host__ __device__ __forceinline__ self_type operator++()
-    {
-        tex_offset++;
-        return *this;
-    }
-
-    /// Indirection
-    __host__ __device__ __forceinline__ reference operator*() const
-    {
-        if (CUB_IS_HOST_CODE) {
-            // Simply dereference the pointer on the host
-            return ptr[tex_offset];
-        } else {
-            #if CUB_INCLUDE_DEVICE_CODE
-                // Use the texture reference
-                return TexId::Fetch(tex_offset);
-            #else
-                // This is dead code that will never be executed.  It is here
-                // only to avoid warnings about missing returns.
-                return ptr[tex_offset];
-            #endif
-        }
-    }
-
-    /// Addition
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator+(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset + n;
-        return retval;
-    }
-
-    /// Addition assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator+=(Distance n)
-    {
-        tex_offset += n;
-        return *this;
-    }
-
-    /// Subtraction
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type operator-(Distance n) const
-    {
-        self_type retval;
-        retval.ptr = ptr;
-        retval.tex_offset = tex_offset - n;
-        return retval;
-    }
-
-    /// Subtraction assignment
-    template <typename Distance>
-    __host__ __device__ __forceinline__ self_type& operator-=(Distance n)
-    {
-        tex_offset -= n;
-        return *this;
-    }
-
-    /// Distance
-    __host__ __device__ __forceinline__ difference_type operator-(self_type other) const
-    {
-        return tex_offset - other.tex_offset;
-    }
-
-    /// Array subscript
-    template <typename Distance>
-    __host__ __device__ __forceinline__ reference operator[](Distance n) const
-    {
-        self_type offset = (*this) + n;
-        return *offset;
-    }
-
-    /// Structure dereference
-    __host__ __device__ __forceinline__ pointer operator->()
-    {
-        return &(*(*this));
-    }
-
-    /// Equal to
-    __host__ __device__ __forceinline__ bool operator==(const self_type& rhs)
-    {
-        return ((ptr == rhs.ptr) && (tex_offset == rhs.tex_offset));
-    }
-
-    /// Not equal to
-    __host__ __device__ __forceinline__ bool operator!=(const self_type& rhs)
-    {
-        return ((ptr != rhs.ptr) || (tex_offset != rhs.tex_offset));
-    }
-
-    /// ostream operator
-    friend std::ostream& operator<<(std::ostream& os, const self_type& /*itr*/)
-    {
-        return os;
-    }
-
-// Re-enable deprecation warnings:
-
-#ifdef __NVCC_DIAG_PRAGMA_SUPPORT__
-#pragma nv_diagnostic pop
-#endif
-
-#if CUB_HOST_COMPILER == CUB_HOST_COMPILER_MSVC
-#pragma warning(default:4996)
-#elif CUB_HOST_COMPILER == CUB_HOST_COMPILER_GCC || \
-      CUB_HOST_COMPILER == CUB_HOST_COMPILER_CLANG
-#pragma GCC diagnostic pop
-#endif
-
-};
-
-
+using TexRefInputIterator CUB_DEPRECATED = cub::TexObjInputIterator<T, OffsetT>;
 
 /** @} */       // end group UtilIterator
 
 CUB_NAMESPACE_END
-
-#endif // CUDART_VERSION
diff --git a/test/test_iterator.cu b/test/test_iterator.cu
index 1eb6f10730..1b5a6c6d03 100644
--- a/test/test_iterator.cu
+++ b/test/test_iterator.cu
@@ -43,7 +43,6 @@
 #include <cub/iterator/constant_input_iterator.cuh>
 #include <cub/iterator/counting_input_iterator.cuh>
 #include <cub/iterator/tex_obj_input_iterator.cuh>
-#include <cub/iterator/tex_ref_input_iterator.cuh>
 #include <cub/iterator/transform_input_iterator.cuh>
 
 #include <cub/util_type.cuh>
@@ -374,70 +373,6 @@ void TestTexObj()
     if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
 }
 
-
-#if CUDART_VERSION >= 5050
-
-/**
- * Test tex-ref texture iterator
- */
-template <typename T, typename CastT>
-void TestTexRef()
-{
-    printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);
-
-    //
-    // Test iterator manipulation in kernel
-    //
-
-    constexpr int TEST_VALUES                   = 11000;
-    constexpr unsigned int DUMMY_OFFSET         = 500;
-    constexpr unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
-
-    T *h_data = new T[TEST_VALUES];
-    for (int i = 0; i < TEST_VALUES; ++i)
-    {
-        RandomBits(h_data[i]);
-    }
-
-    // Allocate device arrays
-    T *d_data   = NULL;
-    T *d_dummy  = NULL;
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
-
-    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
-    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
-
-    // Initialize reference data
-    T h_reference[8];
-    h_reference[0] = h_data[0];          // Value at offset 0
-    h_reference[1] = h_data[100];        // Value at offset 100
-    h_reference[2] = h_data[1000];       // Value at offset 1000
-    h_reference[3] = h_data[10000];      // Value at offset 10000
-    h_reference[4] = h_data[1];          // Value at offset 1
-    h_reference[5] = h_data[21];         // Value at offset 21
-    h_reference[6] = h_data[11];         // Value at offset 11
-    h_reference[7] = h_data[0];          // Value at offset 0;
-
-    // Create and bind ref-based test iterator
-    TexRefInputIterator<T, __LINE__> d_ref_itr;
-    CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
-
-    // Create and bind dummy iterator of same type to check with interferance
-    TexRefInputIterator<T, __LINE__> d_ref_itr2;
-    CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
-
-    Test(d_ref_itr, h_reference);
-
-    CubDebugExit(d_ref_itr.UnbindTexture());
-    CubDebugExit(d_ref_itr2.UnbindTexture());
-
-    if (h_data) delete[] h_data;
-    if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
-    if (d_dummy) CubDebugExit(g_allocator.DeviceFree(d_dummy));
-}
-
-
 /**
  * Test texture transform iterator
  */
@@ -492,11 +427,6 @@ void TestTexTransform()
     if (d_data) CubDebugExit(g_allocator.DeviceFree(d_data));
 }
 
-#endif  // CUDART_VERSION
-
-
-
-
 /**
  * Run non-integer tests
  */
@@ -505,17 +435,8 @@ void Test(Int2Type<false> /* is_integer */)
 {
     TestModified<T, CastT>();
     TestTransform<T, CastT>();
-
-#if CUB_CDP
-    // Test tex-obj iterators if CUDA dynamic parallelism enabled
-    TestTexObj<T, CastT>(type_string);
-#endif  // CUB_CDP
-
-#if CUDART_VERSION >= 5050
-    // Test tex-ref iterators for CUDA 5.5
-    TestTexRef<T, CastT>();
+    TestTexObj<T, CastT>();
     TestTexTransform<T, CastT>();
-#endif  // CUDART_VERSION
 }
 
 /**
diff --git a/test/test_iterator_deprecated.cu b/test/test_iterator_deprecated.cu
new file mode 100644
index 0000000000..b42febe51b
--- /dev/null
+++ b/test/test_iterator_deprecated.cu
@@ -0,0 +1,306 @@
+/******************************************************************************
+ * Copyright (c) 2011, Duane Merrill.  All rights reserved.
+ * Copyright (c) 2011-2018, NVIDIA CORPORATION.  All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions are met:
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in the
+ *       documentation and/or other materials provided with the distribution.
+ *     * Neither the name of the NVIDIA CORPORATION nor the
+ *       names of its contributors may be used to endorse or promote products
+ *       derived from this software without specific prior written permission.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
+ * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
+ * WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+ * DISCLAIMED. IN NO EVENT SHALL NVIDIA CORPORATION BE LIABLE FOR ANY
+ * DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
+ * (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
+ * LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
+ * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
+ * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ *
+ ******************************************************************************/
+
+/******************************************************************************
+ * Test of iterator utilities
+ ******************************************************************************/
+
+// Ensure printing of CUDA runtime errors to console
+#define CUB_STDERR
+
+// This file tests deprecated CUB APIs. Silence deprecation warnings:
+#define CUB_IGNORE_DEPRECATED_API
+
+#include <cub/iterator/tex_ref_input_iterator.cuh>
+#include <cub/util_type.cuh>
+#include <cub/util_allocator.cuh>
+
+#include <iterator>
+#include <cstdio>
+#include <typeinfo>
+
+#include "test_util.h"
+
+using namespace cub;
+
+//---------------------------------------------------------------------
+// Globals, constants and typedefs
+//---------------------------------------------------------------------
+
+bool                    g_verbose = false;
+CachingDeviceAllocator  g_allocator(true);
+
+//---------------------------------------------------------------------
+// Test kernels
+//---------------------------------------------------------------------
+
+/**
+ * Test random access input iterator
+ */
+template <
+    typename InputIteratorT,
+    typename T>
+__global__ void Kernel(
+    InputIteratorT    d_in,
+    T                 *d_out,
+    InputIteratorT    *d_itrs)
+{
+    d_out[0] = *d_in;               // Value at offset 0
+    d_out[1] = d_in[100];           // Value at offset 100
+    d_out[2] = *(d_in + 1000);      // Value at offset 1000
+    d_out[3] = *(d_in + 10000);     // Value at offset 10000
+
+    d_in++;
+    d_out[4] = d_in[0];             // Value at offset 1
+
+    d_in += 20;
+    d_out[5] = d_in[0];             // Value at offset 21
+    d_itrs[0] = d_in;               // Iterator at offset 21
+
+    d_in -= 10;
+    d_out[6] = d_in[0];             // Value at offset 11;
+
+    d_in -= 11;
+    d_out[7] = d_in[0];             // Value at offset 0
+    d_itrs[1] = d_in;               // Iterator at offset 0
+}
+
+
+
+//---------------------------------------------------------------------
+// Host testing subroutines
+//---------------------------------------------------------------------
+
+
+/**
+ * Run iterator test on device
+ */
+template <
+    typename        InputIteratorT,
+    typename        T,
+    int             TEST_VALUES>
+void Test(
+    InputIteratorT  d_in,
+    T               (&h_reference)[TEST_VALUES])
+{
+    // Allocate device arrays
+    T                 *d_out    = NULL;
+    InputIteratorT    *d_itrs   = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_out,     sizeof(T) * TEST_VALUES));
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_itrs,    sizeof(InputIteratorT) * 2));
+
+    int compare;
+
+    // Run unguarded kernel
+    Kernel<<<1, 1>>>(d_in, d_out, d_itrs);
+
+    CubDebugExit(cudaPeekAtLastError());
+    CubDebugExit(cudaDeviceSynchronize());
+
+    // Check results
+    compare = CompareDeviceResults(h_reference, d_out, TEST_VALUES, g_verbose, g_verbose);
+    printf("\tValues: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check iterator at offset 21
+    InputIteratorT h_itr = d_in + 21;
+    compare = CompareDeviceResults(&h_itr, d_itrs, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Check iterator at offset 0
+    compare = CompareDeviceResults(&d_in, d_itrs + 1, 1, g_verbose, g_verbose);
+    printf("\tIterators: %s\n", (compare) ? "FAIL" : "PASS");
+    AssertEquals(0, compare);
+
+    // Cleanup
+    if (d_out)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_out));
+    }
+    if (d_itrs)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_itrs));
+    }
+}
+
+/**
+ * Test tex-ref texture iterator
+ */
+template <typename T, typename CastT>
+void TestTexRef()
+{
+    printf("\nTesting tex-ref iterator on type %s\n", typeid(T).name()); fflush(stdout);
+
+    //
+    // Test iterator manipulation in kernel
+    //
+
+    constexpr int TEST_VALUES                   = 11000;
+    constexpr unsigned int DUMMY_OFFSET         = 500;
+    constexpr unsigned int DUMMY_TEST_VALUES    = TEST_VALUES - DUMMY_OFFSET;
+
+    T *h_data = new T[TEST_VALUES];
+    for (int i = 0; i < TEST_VALUES; ++i)
+    {
+        RandomBits(h_data[i]);
+    }
+
+    // Allocate device arrays
+    T *d_data   = NULL;
+    T *d_dummy  = NULL;
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_data, sizeof(T) * TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_data, h_data, sizeof(T) * TEST_VALUES, cudaMemcpyHostToDevice));
+
+    CubDebugExit(g_allocator.DeviceAllocate((void**)&d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+    CubDebugExit(cudaMemcpy(d_dummy, h_data + DUMMY_OFFSET, sizeof(T) * DUMMY_TEST_VALUES, cudaMemcpyHostToDevice));
+
+    // Initialize reference data
+    T h_reference[8];
+    h_reference[0] = h_data[0];          // Value at offset 0
+    h_reference[1] = h_data[100];        // Value at offset 100
+    h_reference[2] = h_data[1000];       // Value at offset 1000
+    h_reference[3] = h_data[10000];      // Value at offset 10000
+    h_reference[4] = h_data[1];          // Value at offset 1
+    h_reference[5] = h_data[21];         // Value at offset 21
+    h_reference[6] = h_data[11];         // Value at offset 11
+    h_reference[7] = h_data[0];          // Value at offset 0;
+
+    // Create and bind ref-based test iterator
+    TexRefInputIterator<T, __LINE__> d_ref_itr;
+    CubDebugExit(d_ref_itr.BindTexture((CastT*) d_data, sizeof(T) * TEST_VALUES));
+
+    // Create and bind dummy iterator of same type to check with interferance
+    TexRefInputIterator<T, __LINE__> d_ref_itr2;
+    CubDebugExit(d_ref_itr2.BindTexture((CastT*) d_dummy, sizeof(T) * DUMMY_TEST_VALUES));
+
+    Test(d_ref_itr, h_reference);
+
+    CubDebugExit(d_ref_itr.UnbindTexture());
+    CubDebugExit(d_ref_itr2.UnbindTexture());
+
+    if (h_data)
+    {
+        delete[] h_data;
+    }
+    if (d_data)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_data));
+    }
+    if (d_dummy)
+    {
+        CubDebugExit(g_allocator.DeviceFree(d_dummy));
+    }
+}
+
+/**
+ * Run non-integer tests
+ */
+template <typename T, typename CastT>
+void Test()
+{
+    TestTexRef<T, CastT>();
+}
+
+/**
+ * Run tests
+ */
+template <typename T>
+void Test()
+{
+    // Test non-const type
+    Test<T, T>();
+
+    // Test non-const type
+    Test<T, const T>();
+}
+
+
+/**
+ * Main
+ */
+int main(int argc, char** argv)
+{
+    // Initialize command line
+    CommandLineArgs args(argc, argv);
+    g_verbose = args.CheckCmdLineFlag("v");
+
+    // Print usage
+    if (args.CheckCmdLineFlag("help"))
+    {
+        printf("%s "
+            "[--device=<device-id>] "
+            "[--v] "
+            "\n", argv[0]);
+        exit(0);
+    }
+
+    // Initialize device
+    CubDebugExit(args.DeviceInit());
+
+    // Evaluate different data types
+    Test<char>();
+    Test<short>();
+    Test<int>();
+    Test<long>();
+    Test<long long>();
+    Test<float>();
+    Test<double>();
+
+    Test<char2>();
+    Test<short2>();
+    Test<int2>();
+    Test<long2>();
+    Test<longlong2>();
+    Test<float2>();
+    Test<double2>();
+
+    Test<char3>();
+    Test<short3>();
+    Test<int3>();
+    Test<long3>();
+    Test<longlong3>();
+    Test<float3>();
+    Test<double3>();
+
+    Test<char4>();
+    Test<short4>();
+    Test<int4>();
+    Test<long4>();
+    Test<longlong4>();
+    Test<float4>();
+    Test<double4>();
+
+    Test<TestFoo>();
+    Test<TestBar>();
+
+    printf("\nTest complete\n");
+    fflush(stdout);
+
+    return 0;
+}