[Opt] Optimize assign API using kernel level pipeline and TLP on HBM.

NVIDIA-Merlin · Aug 18, 2023 · 6a320cc · 6a320cc
1 parent 5419e6b
commit 6a320cc
Show file tree

Hide file tree

Showing 6 changed files with 911 additions and 45 deletions.
diff --git a/README.md b/README.md
@@ -134,73 +134,73 @@ Your environment must meet the following requirements:
 
 |    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |
 |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
-| 0.50 |            1.151 |  2.535 |          1.691 |  1.927 |  4.127 |           1.799 |            1.071 |
-| 0.75 |            0.999 |  2.538 |          0.668 |  0.864 |  1.927 |           1.293 |            1.026 |
-| 1.00 |            0.364 |  2.559 |          0.370 |  0.498 |  0.929 |           0.392 |            0.505 |
+| 0.50 |            1.153 |  2.580 |          1.706 |  1.798 |  4.111 |           1.801 |            1.083 |
+| 0.75 |            1.010 |  2.561 |          0.683 |  1.863 |  1.939 |           1.305 |            0.994 |
+| 1.00 |            0.366 |  2.580 |          0.371 |  1.894 |  0.932 |           0.393 |            0.517 |
 
 |    λ | export_batch | export_batch_if |
 |-----:|-------------:|----------------:|
-| 0.50 |        2.186 |          18.059 |
-| 0.75 |        2.163 |          16.762 |
-| 1.00 |        2.059 |           2.758 |
+| 0.50 |        2.180 |          18.766 |
+| 0.75 |        2.179 |          18.873 |
+| 1.00 |        2.081 |           2.813 |
 
 * dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB
 
 |    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |
 |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
-| 0.50 |            1.073 |  2.341 |          1.265 |  1.594 |  4.121 |           1.795 |            0.931 |
-| 0.75 |            0.862 |  2.295 |          0.639 |  0.852 |  1.925 |           1.292 |            0.874 |
-| 1.00 |            0.359 |  2.345 |          0.345 |  0.492 |  0.926 |           0.375 |            0.467 |
+| 0.50 |            1.068 |  2.325 |          1.277 |  1.759 |  4.113 |           1.807 |            0.932 |
+| 0.75 |            0.862 |  2.309 |          0.651 |  1.754 |  1.934 |           1.304 |            0.874 |
+| 1.00 |            0.360 |  2.349 |          0.346 |  1.825 |  0.926 |           0.373 |            0.470 |
 
 |    λ | export_batch | export_batch_if |
 |-----:|-------------:|----------------:|
-| 0.50 |        0.698 |          14.380 |
-| 0.75 |        0.574 |          13.489 |
-| 1.00 |        0.563 |           0.761 |
+| 0.50 |        0.701 |          15.185 |
+| 0.75 |        0.578 |          13.840 |
+| 1.00 |        0.567 |           0.777 |
 
 * dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB
 
 |    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* | insert_and_evict |
 |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
-| 0.50 |            0.864 |  2.040 |          0.921 |  1.112 |  4.395 |           1.825 |            0.806 |
-| 0.75 |            0.668 |  2.011 |          0.571 |  0.789 |  1.974 |           1.295 |            0.764 |
-| 1.00 |            0.333 |  2.050 |          0.334 |  0.469 |  0.938 |           0.392 |            0.481 |
+| 0.50 |            0.862 |  2.041 |          0.930 |  1.543 |  4.399 |           1.854 |            0.807 |
+| 0.75 |            0.662 |  2.022 |          0.579 |  1.541 |  1.983 |           1.299 |            0.771 |
+| 1.00 |            0.334 |  2.052 |          0.335 |  1.590 |  0.939 |           0.393 |            0.473 |
 
 |    λ | export_batch | export_batch_if |
 |-----:|-------------:|----------------:|
-| 0.50 |        0.319 |          10.553 |
-| 0.75 |        0.298 |          10.400 |
-| 1.00 |        0.293 |           0.390 |
+| 0.50 |        0.321 |          10.791 |
+| 0.75 |        0.298 |          10.864 |
+| 1.00 |        0.293 |           0.393 |
 
 ### On HBM+HMEM hybrid mode: 
 
 * dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB
 
 |    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* |
 |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
-| 0.50 |            0.083 |  0.124 |          0.116 |  0.132 |  4.032 |           1.791 |
-| 0.75 |            0.082 |  0.123 |          0.114 |  0.129 |  1.906 |           1.131 |
-| 1.00 |            0.069 |  0.110 |          0.087 |  0.105 |  0.926 |           0.392 |
+| 0.50 |            0.083 |  0.123 |          0.115 |  0.131 |  4.015 |           1.804 |
+| 0.75 |            0.082 |  0.123 |          0.111 |  0.129 |  1.911 |           1.136 |
+| 1.00 |            0.069 |  0.110 |          0.085 |  0.106 |  0.929 |           0.393 |
 
 |    λ | export_batch | export_batch_if |
 |-----:|-------------:|----------------:|
-| 0.50 |        0.319 |          10.789 |
-| 0.75 |        0.299 |          10.222 |
-| 1.00 |        0.294 |           0.389 |
+| 0.50 |        0.322 |          10.720 |
+| 0.75 |        0.299 |          10.587 |
+| 1.00 |        0.293 |           0.394 |
 
 * dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB
 
 |    λ | insert_or_assign |   find | find_or_insert | assign |  find* | find_or_insert* |
 |-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
-| 0.50 |            0.049 |  0.073 |          0.049 |  0.070 |  3.535 |           1.718 |
-| 0.75 |            0.049 |  0.072 |          0.048 |  0.069 |  1.850 |           1.247 |
-| 1.00 |            0.044 |  0.068 |          0.044 |  0.061 |  0.911 |           0.390 |
+| 0.50 |            0.049 |  0.073 |          0.048 |  0.070 |  3.530 |           1.730 |
+| 0.75 |            0.049 |  0.072 |          0.048 |  0.069 |  1.849 |           1.261 |
+| 1.00 |            0.044 |  0.068 |          0.044 |  0.062 |  0.911 |           0.393 |
 
 |    λ | export_batch | export_batch_if |
 |-----:|-------------:|----------------:|
-| 0.50 |        0.318 |          10.987 |
-| 0.75 |        0.298 |          11.213 |
-| 1.00 |        0.294 |           0.388 |
+| 0.50 |        0.321 |          11.148 |
+| 0.75 |        0.299 |          11.269 |
+| 1.00 |        0.293 |           0.396 |
 
 ### Support and Feedback:
 

diff --git a/include/merlin/core_kernels/kernel_utils.cuh b/include/merlin/core_kernels/kernel_utils.cuh
@@ -223,6 +223,16 @@ __forceinline__ __device__ void update_score_digest(
   __stcg(dst_score_ptr, score);
 }
 
+template <typename K, typename V, typename S, typename BUCKET = Bucket<K, V, S>>
+__forceinline__ __device__ void update_score(K* bucket_keys_ptr,
+                                             const uint32_t bucket_capacity,
+                                             const uint32_t key_pos,
+                                             const S& score) {
+  S* dst_score_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
+  // Cache in L2 cache, bypass L1 Cache.
+  __stcg(dst_score_ptr, score);
+}
+
 template <class K, class V, class S>
 __forceinline__ __device__ void update_score(Bucket<K, V, S>* __restrict bucket,
                                              const int key_pos,