Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Opt] Optimize insert_and_evict API using pipeline and TLP. #157

Merged
merged 1 commit into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 32 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,79 +128,79 @@ Your environment must meet the following requirements:
* `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values.
* ***Throughput Unit: Billion-KV/second***

### On pure HBM mode:

* dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 | 1.161 | 2.813 | 1.708 | 1.942 | 4.147 | 1.801 | 1.023 |
| 0.75 | 1.004 | 2.785 | 0.669 | 0.865 | 1.939 | 1.302 | 0.873 |
| 1.00 | 0.364 | 2.801 | 0.370 | 0.499 | 0.930 | 0.392 | 0.315 |
| 0.50 | 1.151 | 2.535 | 1.691 | 1.927 | 4.127 | 1.799 | 1.071 |
| 0.75 | 0.999 | 2.538 | 0.668 | 0.864 | 1.927 | 1.293 | 1.026 |
| 1.00 | 0.364 | 2.559 | 0.370 | 0.498 | 0.929 | 0.392 | 0.505 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 2.213 | 19.096 |
| 0.75 | 2.165 | 19.825 |
| 1.00 | 2.067 | 2.818 |
| 0.50 | 2.186 | 18.059 |
| 0.75 | 2.163 | 16.762 |
| 1.00 | 2.059 | 2.758 |

* dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 | 1.071 | 2.490 | 1.264 | 1.602 | 4.136 | 1.801 | 0.949 |
| 0.75 | 0.866 | 2.462 | 0.637 | 0.853 | 1.939 | 1.302 | 0.740 |
| 1.00 | 0.359 | 2.573 | 0.348 | 0.492 | 0.925 | 0.377 | 0.278 |
| 0.50 | 1.073 | 2.341 | 1.265 | 1.594 | 4.121 | 1.795 | 0.931 |
| 0.75 | 0.862 | 2.295 | 0.639 | 0.852 | 1.925 | 1.292 | 0.874 |
| 1.00 | 0.359 | 2.345 | 0.345 | 0.492 | 0.926 | 0.375 | 0.467 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.709 | 15.918 |
| 0.75 | 0.575 | 14.923 |
| 1.00 | 0.567 | 0.758 |
| 0.50 | 0.698 | 14.380 |
| 0.75 | 0.574 | 13.489 |
| 1.00 | 0.563 | 0.761 |

* dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 | 0.868 | 2.122 | 0.916 | 1.116 | 4.451 | 1.838 | 0.773 |
| 0.75 | 0.670 | 2.112 | 0.570 | 0.790 | 1.984 | 1.289 | 0.587 |
| 1.00 | 0.333 | 2.162 | 0.335 | 0.467 | 0.940 | 0.393 | 0.240 |
| 0.50 | 0.864 | 2.040 | 0.921 | 1.112 | 4.395 | 1.825 | 0.806 |
| 0.75 | 0.668 | 2.011 | 0.571 | 0.789 | 1.974 | 1.295 | 0.764 |
| 1.00 | 0.333 | 2.050 | 0.334 | 0.469 | 0.938 | 0.392 | 0.481 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.321 | 11.050 |
| 0.75 | 0.301 | 10.965 |
| 1.00 | 0.292 | 0.390 |
| 0.50 | 0.319 | 10.553 |
| 0.75 | 0.298 | 10.400 |
| 1.00 | 0.293 | 0.390 |

### On HBM+HMEM hybrid mode:

* dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
| 0.50 | 0.118 | 0.145 | 0.119 | 0.147 | 4.097 | 1.807 |
| 0.75 | 0.116 | 0.144 | 0.115 | 0.142 | 1.932 | 1.300 |
| 1.00 | 0.091 | 0.126 | 0.092 | 0.114 | 0.927 | 0.379 |
| 0.50 | 0.083 | 0.124 | 0.116 | 0.132 | 4.032 | 1.791 |
| 0.75 | 0.082 | 0.123 | 0.114 | 0.129 | 1.906 | 1.131 |
| 1.00 | 0.069 | 0.110 | 0.087 | 0.105 | 0.926 | 0.392 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.322 | 11.206 |
| 0.75 | 0.300 | 11.072 |
| 1.00 | 0.292 | 0.388 |
| 0.50 | 0.319 | 10.789 |
| 0.75 | 0.299 | 10.222 |
| 1.00 | 0.294 | 0.389 |

* dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
| 0.50 | 0.049 | 0.072 | 0.047 | 0.068 | 3.559 | 1.718 |
| 0.75 | 0.048 | 0.072 | 0.048 | 0.069 | 1.854 | 1.251 |
| 1.00 | 0.044 | 0.067 | 0.044 | 0.061 | 0.912 | 0.360 |
| 0.50 | 0.049 | 0.073 | 0.049 | 0.070 | 3.535 | 1.718 |
| 0.75 | 0.049 | 0.072 | 0.048 | 0.069 | 1.850 | 1.247 |
| 1.00 | 0.044 | 0.068 | 0.044 | 0.061 | 0.911 | 0.390 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.319 | 10.175 |
| 0.75 | 0.298 | 11.606 |
| 1.00 | 0.292 | 0.388 |
| 0.50 | 0.318 | 10.987 |
| 0.75 | 0.298 | 11.213 |
| 1.00 | 0.294 | 0.388 |

### Support and Feedback:

Expand Down
112 changes: 88 additions & 24 deletions include/merlin/core_kernels/kernel_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,41 +32,68 @@ namespace cg = cooperative_groups;
namespace nv {
namespace merlin {

// Vector type used when loading digests from memory.
using VecD_Load = byte16;
// Vector type used when computing over digests.
using VecD_Comp = byte4;

// Double-buffer selection helpers.
// same_buf: even i selects buffer 0, odd i selects buffer 1.
__forceinline__ __device__ int same_buf(int i) { return i & 0x01; }
// diff_buf: even i selects buffer 1, odd i selects buffer 0.
__forceinline__ __device__ int diff_buf(int i) { return (i + 1) & 0x01; }

// Digest of the reserved EMPTY_KEY sentinel: hash the sentinel and keep
// bits [32, 32 + 8*sizeof(D)).
// NOTE(review): the scraped diff showed both the old (uint8_t) and new (D)
// lines interleaved; this is the reconstructed post-change version.
template <typename K>
__forceinline__ __device__ D empty_digest() {
  const K hashed_key = Murmur3HashDevice(static_cast<K>(EMPTY_KEY));
  return static_cast<D>(hashed_key >> 32);
}

// Digest of `key`: hash it and keep bits [32, 32 + 8*sizeof(D)).
// NOTE(review): reconstructed post-change version of a diff hunk that
// contained both the old (uint8_t) and new (D) lines.
template <typename K>
__forceinline__ __device__ D get_digest(const K& key) {
  const K hashed_key = Murmur3HashDevice(key);
  return static_cast<D>(hashed_key >> 32);
}

// Build a comparison vector whose four bytes all equal the digest derived
// from an already-hashed key.
template <typename K>
__forceinline__ __device__ VecD_Comp digests_from_hashed(const K& hashed_key) {
  const D d = static_cast<D>(hashed_key >> 32);
  // Selector 0x0000 broadcasts byte 0 of `d` into every byte lane.
  return static_cast<VecD_Comp>(__byte_perm(d, d, 0x0000));
}

// Comparison vector whose four bytes all equal the EMPTY_KEY digest.
template <typename K>
__forceinline__ __device__ VecD_Comp empty_digests() {
  const D d = empty_digest<K>();
  // Selector 0x0000 broadcasts byte 0 of `d` into every byte lane.
  return static_cast<VecD_Comp>(__byte_perm(d, d, 0x0000));
}

// Align `pos` downwards to a multiple of ALIGN_SIZE.
// Returns the aligned value; `pos` itself is not modified.
template <uint32_t ALIGN_SIZE>
__forceinline__ __device__ uint32_t align_to(uint32_t& pos) {
  // The mask trick is only valid for powers of two; enforce at compile time.
  static_assert(ALIGN_SIZE != 0 && (ALIGN_SIZE & (ALIGN_SIZE - 1)) == 0,
                "ALIGN_SIZE must be a nonzero power of two");
  // ~(ALIGN_SIZE - 1) == 0xffffffffU - (ALIGN_SIZE - 1): clears the low bits.
  constexpr uint32_t MASK = ~(ALIGN_SIZE - 1);
  return pos & MASK;
}

// Copy one element from global to shared memory ("LDG" + "STS").
// Specializations below handle 1- and 2-byte element types with plain
// load/store; a generic overload further down is annotated as requiring
// compute capability >= 8.0. NOTE(review): completion/async semantics of
// the generic overload are not visible here — confirm callers pair it
// with the appropriate pipeline wait.
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
const ElementType* src);

// Specialization for 1-byte elements: synchronous load + store.
// NOTE(review): reconstructed post-change version of a diff hunk that
// contained both the old (uint8_t) and new (byte) lines.
template <>
__forceinline__ __device__ void LDGSTS<byte>(byte* dst, const byte* src) {
  byte element = *src;
  *dst = element;
}

// Specialization for 2-byte elements: synchronous load + store.
// NOTE(review): reconstructed post-change version of a diff hunk that
// contained both the old (uint16_t) and new (byte2) lines.
template <>
__forceinline__ __device__ void LDGSTS<byte2>(byte2* dst, const byte2* src) {
  byte2 element = *src;
  *dst = element;
}

// Require compute ability >= 8.0
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
const ElementType* src) {
Expand Down Expand Up @@ -96,11 +123,11 @@ struct CopyScoreByPassCache {
// Visible tail of struct CopyScoreByPassCache (header is outside this view).
// NOTE(review): `stg` is reconstructed; the diff hunk showed both the old
// `__stwt` and new `__stcs` store lines.
__forceinline__ __device__ static S lgs(const S* src) { return src[0]; }

__forceinline__ __device__ static void stg(S* dst, const S score_) {
  // Streaming store (evict-first cache hint).
  __stcs(dst, score_);
}
};

template <typename VecV = float4, int GROUP_SIZE = 16>
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueOneGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
Expand All @@ -113,52 +140,89 @@ struct CopyValueOneGroup {
int offset = rank;
if (offset < dim) {
VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
}
}
};

template <typename VecV = float4, int GROUP_SIZE = 16>
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueTwoGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
int offset = rank;
LDGSTS<VecV>(dst + offset, src + offset);
offset += GROUP_SIZE;
if (offset < dim) LDGSTS<VecV>(dst + offset, src + offset);
}

__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
int offset = rank;
const VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
offset += GROUP_SIZE;
if (offset < dim) {
const VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
}
}
};

template <typename VecV = float4, int GROUP_SIZE = 16>
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueMultipleGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
LDGSTS<VecV>(dst + offset, src + offset);
}
}

__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
}
}

__forceinline__ __device__ static void ldg_stg(int rank, VecV* dst,
const VecV* src,
const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
VecV vec_v = __ldcs(src + offset);
__stcs(dst + offset, vec_v);
}
}
};

// Write an evicted (key, score) pair to the output arrays at `evict_idx`.
// `evicted_scores` may be nullptr, in which case only the key is written.
template <typename K, typename S>
__forceinline__ __device__ void evict_key_score(K* evicted_keys,
                                                S* evicted_scores,
                                                const uint32_t evict_idx,
                                                const K& key, const S& score) {
  // Streaming stores (evict_first cache strategy).
  __stcs(evicted_keys + evict_idx, key);
  if (evicted_scores == nullptr) return;
  __stcs(evicted_scores + evict_idx, score);
}

// Refresh the score and digest stored in a bucket slot `key_pos` for `key`.
template <typename K, typename V, typename S, typename BUCKET = Bucket<K, V, S>>
__forceinline__ __device__ void update_score_digest(
    K* bucket_keys_ptr, const uint32_t bucket_capacity, const uint32_t key_pos,
    const K& key, const S& score) {
  D* digest_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, key_pos);
  S* score_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
  // __stcg: cache in L2, bypass L1.
  __stcg(digest_ptr, get_digest<K>(key));
  __stcg(score_ptr, score);
}

template <class K, class V, class S>
__forceinline__ __device__ void update_score(Bucket<K, V, S>* __restrict bucket,
const int key_pos,
Expand Down
12 changes: 8 additions & 4 deletions include/merlin/core_kernels/lookup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,10 @@ __global__ void lookup_kernel_with_io_pipeline_v1(
__pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
digests_ptr + rank * 4, sizeof(uint32_t));
__pipeline_commit();
__pipeline_commit(); // padding
__pipeline_commit(); // padding
// Two padding commits so that the first `__pipeline_wait_prior` call in
// the loop below observes the expected number of in-flight stages.
__pipeline_commit();
__pipeline_commit();

for (int i = 0; i < loop_num; i++) {
int key_idx_block = groupID * GROUP_SIZE + i;
Expand Down Expand Up @@ -369,8 +371,10 @@ __global__ void lookup_kernel_with_io_pipeline_v2(
sm_probing_digests[0] + groupID * DIGEST_SPAN + rank * 2,
digests_ptr + rank * 8, sizeof(uint2));
__pipeline_commit();
__pipeline_commit(); // padding
__pipeline_commit(); // padding
// Two padding commits so that the first `__pipeline_wait_prior` call in
// the loop below observes the expected number of in-flight stages.
__pipeline_commit();
__pipeline_commit();

for (int i = 0; i < loop_num; i++) {
int key_idx_block = groupID * GROUP_SIZE + i;
Expand Down
Loading