[Opt] Optimize insert_and_evict API using kernel level pipeline and thread level parallelism.
jiashuy authored and rhdong committed Aug 7, 2023
1 parent 0da72d4 commit 141d9b6
Showing 6 changed files with 1,520 additions and 94 deletions.
64 changes: 32 additions & 32 deletions README.md
@@ -128,79 +128,79 @@ Your environment must meet the following requirements:
* `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values.
* ***Throughput Unit: Billion-KV/second***
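
As a reading aid for the numbers below: a Billion-KV/second figure is just the batch size divided by the measured kernel time. A minimal sketch of that arithmetic with CUDA events follows; `dummy_batch_op` is a stand-in kernel, not the HierarchicalKV `insert_and_evict` API, so only the timing pattern carries over.

```cuda
// Sketch only: times a stand-in kernel and reports Billion-KV/second.
// The real benchmark times HierarchicalKV calls; this shows the arithmetic.
#include <cstdint>
#include <cstdio>
#include <cuda_runtime.h>

__global__ void dummy_batch_op(const uint64_t* keys, float* values, size_t n) {
  size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < n) values[i] = static_cast<float>(keys[i] & 0xFF);
}

int main() {
  const size_t n = 1 << 24;  // 16M keys per batch.
  uint64_t* keys;
  float* values;
  cudaMalloc(&keys, n * sizeof(uint64_t));
  cudaMalloc(&values, n * sizeof(float));

  cudaEvent_t start, stop;
  cudaEventCreate(&start);
  cudaEventCreate(&stop);

  cudaEventRecord(start);
  dummy_batch_op<<<(n + 255) / 256, 256>>>(keys, values, n);
  cudaEventRecord(stop);
  cudaEventSynchronize(stop);

  float ms = 0.f;
  cudaEventElapsedTime(&ms, start, stop);
  printf("%.3f Billion-KV/s\n", n / (ms * 1e-3) / 1e9);  // keys / seconds / 1e9

  cudaFree(keys);
  cudaFree(values);
  return 0;
}
```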

### On pure HBM mode:

* dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
-| 0.50 | 1.161 | 2.813 | 1.708 | 1.942 | 4.147 | 1.801 | 1.023 |
-| 0.75 | 1.004 | 2.785 | 0.669 | 0.865 | 1.939 | 1.302 | 0.873 |
-| 1.00 | 0.364 | 2.801 | 0.370 | 0.499 | 0.930 | 0.392 | 0.315 |
+| 0.50 | 1.151 | 2.535 | 1.691 | 1.927 | 4.127 | 1.799 | 1.071 |
+| 0.75 | 0.999 | 2.538 | 0.668 | 0.864 | 1.927 | 1.293 | 1.026 |
+| 1.00 | 0.364 | 2.559 | 0.370 | 0.498 | 0.929 | 0.392 | 0.505 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
-| 0.50 | 2.213 | 19.096 |
-| 0.75 | 2.165 | 19.825 |
-| 1.00 | 2.067 | 2.818 |
+| 0.50 | 2.186 | 18.059 |
+| 0.75 | 2.163 | 16.762 |
+| 1.00 | 2.059 | 2.758 |

* dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
-| 0.50 | 1.071 | 2.490 | 1.264 | 1.602 | 4.136 | 1.801 | 0.949 |
-| 0.75 | 0.866 | 2.462 | 0.637 | 0.853 | 1.939 | 1.302 | 0.740 |
-| 1.00 | 0.359 | 2.573 | 0.348 | 0.492 | 0.925 | 0.377 | 0.278 |
+| 0.50 | 1.073 | 2.341 | 1.265 | 1.594 | 4.121 | 1.795 | 0.931 |
+| 0.75 | 0.862 | 2.295 | 0.639 | 0.852 | 1.925 | 1.292 | 0.874 |
+| 1.00 | 0.359 | 2.345 | 0.345 | 0.492 | 0.926 | 0.375 | 0.467 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
-| 0.50 | 0.709 | 15.918 |
-| 0.75 | 0.575 | 14.923 |
-| 1.00 | 0.567 | 0.758 |
+| 0.50 | 0.698 | 14.380 |
+| 0.75 | 0.574 | 13.489 |
+| 1.00 | 0.563 | 0.761 |

* dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
-| 0.50 | 0.868 | 2.122 | 0.916 | 1.116 | 4.451 | 1.838 | 0.773 |
-| 0.75 | 0.670 | 2.112 | 0.570 | 0.790 | 1.984 | 1.289 | 0.587 |
-| 1.00 | 0.333 | 2.162 | 0.335 | 0.467 | 0.940 | 0.393 | 0.240 |
+| 0.50 | 0.864 | 2.040 | 0.921 | 1.112 | 4.395 | 1.825 | 0.806 |
+| 0.75 | 0.668 | 2.011 | 0.571 | 0.789 | 1.974 | 1.295 | 0.764 |
+| 1.00 | 0.333 | 2.050 | 0.334 | 0.469 | 0.938 | 0.392 | 0.481 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
-| 0.50 | 0.321 | 11.050 |
-| 0.75 | 0.301 | 10.965 |
-| 1.00 | 0.292 | 0.390 |
+| 0.50 | 0.319 | 10.553 |
+| 0.75 | 0.298 | 10.400 |
+| 1.00 | 0.293 | 0.390 |

### On HBM+HMEM hybrid mode:

* dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
-| 0.50 | 0.118 | 0.145 | 0.119 | 0.147 | 4.097 | 1.807 |
-| 0.75 | 0.116 | 0.144 | 0.115 | 0.142 | 1.932 | 1.300 |
-| 1.00 | 0.091 | 0.126 | 0.092 | 0.114 | 0.927 | 0.379 |
+| 0.50 | 0.083 | 0.124 | 0.116 | 0.132 | 4.032 | 1.791 |
+| 0.75 | 0.082 | 0.123 | 0.114 | 0.129 | 1.906 | 1.131 |
+| 1.00 | 0.069 | 0.110 | 0.087 | 0.105 | 0.926 | 0.392 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
-| 0.50 | 0.322 | 11.206 |
-| 0.75 | 0.300 | 11.072 |
-| 1.00 | 0.292 | 0.388 |
+| 0.50 | 0.319 | 10.789 |
+| 0.75 | 0.299 | 10.222 |
+| 1.00 | 0.294 | 0.389 |

* dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
-| 0.50 | 0.049 | 0.072 | 0.047 | 0.068 | 3.559 | 1.718 |
-| 0.75 | 0.048 | 0.072 | 0.048 | 0.069 | 1.854 | 1.251 |
-| 1.00 | 0.044 | 0.067 | 0.044 | 0.061 | 0.912 | 0.360 |
+| 0.50 | 0.049 | 0.073 | 0.049 | 0.070 | 3.535 | 1.718 |
+| 0.75 | 0.049 | 0.072 | 0.048 | 0.069 | 1.850 | 1.247 |
+| 1.00 | 0.044 | 0.068 | 0.044 | 0.061 | 0.911 | 0.390 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
-| 0.50 | 0.319 | 10.175 |
-| 0.75 | 0.298 | 11.606 |
-| 1.00 | 0.292 | 0.388 |
+| 0.50 | 0.318 | 10.987 |
+| 0.75 | 0.298 | 11.213 |
+| 1.00 | 0.294 | 0.388 |

### Support and Feedback:

112 changes: 88 additions & 24 deletions include/merlin/core_kernels/kernel_utils.cuh
@@ -32,41 +32,68 @@ namespace cg = cooperative_groups;
namespace nv {
namespace merlin {

-// if i % 2 == 0, select buffer 0, else buffer 1
+// Vector Type of digests for memory access.
+using VecD_Load = byte16;
+// Vector Type of digests for computation.
+using VecD_Comp = byte4;
+
+// Select from double buffer.
+// If i % 2 == 0, select buffer 0, else buffer 1.
__forceinline__ __device__ int same_buf(int i) { return (i & 0x01) ^ 0; }
-// if i % 2 == 0, select buffer 1, else buffer 0
+// If i % 2 == 0, select buffer 1, else buffer 0.
__forceinline__ __device__ int diff_buf(int i) { return (i & 0x01) ^ 1; }
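
`same_buf` and `diff_buf` pick which half of a two-deep buffer a pipeline stage touches: a group consumes tile `i` from one half while tile `i + 1` is still landing in the other. The kernel below is an illustration of that pattern only, not code from this commit; it assumes the `<cuda_pipeline.h>` primitives, compute capability 8.0 or newer, and a launch of one block with `BLOCK` threads.

```cuda
#include <cuda_pipeline.h>

__forceinline__ __device__ int same_buf(int i) { return (i & 0x01) ^ 0; }
__forceinline__ __device__ int diff_buf(int i) { return (i & 0x01) ^ 1; }

// Sums num_tiles tiles of BLOCK ints; `in` holds num_tiles * BLOCK ints and
// num_tiles >= 1. Each thread streams its own 4-byte element per tile.
template <int BLOCK>
__global__ void tile_sum(const int* in, int* out, int num_tiles) {
  __shared__ int buf[2][BLOCK];
  const int tid = threadIdx.x;
  int acc = 0;

  // Prologue: start fetching tile 0 into buffer 0.
  __pipeline_memcpy_async(&buf[0][tid], &in[tid], sizeof(int));
  __pipeline_commit();

  for (int i = 0; i + 1 < num_tiles; ++i) {
    // Prefetch tile i+1 into the buffer *not* used by tile i.
    __pipeline_memcpy_async(&buf[diff_buf(i)][tid],
                            &in[(i + 1) * BLOCK + tid], sizeof(int));
    __pipeline_commit();
    // Allow at most the newest copy to stay in flight: tile i has landed.
    __pipeline_wait_prior(1);
    acc += buf[same_buf(i)][tid];
  }
  __pipeline_wait_prior(0);  // Drain the pipeline for the last tile.
  acc += buf[same_buf(num_tiles - 1)][tid];
  out[tid] = acc;
}
```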

template <typename K>
-__forceinline__ __device__ uint8_t empty_digest() {
+__forceinline__ __device__ D empty_digest() {
const K hashed_key = Murmur3HashDevice(static_cast<K>(EMPTY_KEY));
-return static_cast<uint8_t>(hashed_key >> 32);
+return static_cast<D>(hashed_key >> 32);
}

template <typename K>
-__forceinline__ __device__ uint8_t get_digest(const K& key) {
+__forceinline__ __device__ D get_digest(const K& key) {
const K hashed_key = Murmur3HashDevice(key);
-return static_cast<uint8_t>(hashed_key >> 32);
+return static_cast<D>(hashed_key >> 32);
}

+// Get vector of digests for computation.
+template <typename K>
+__forceinline__ __device__ VecD_Comp digests_from_hashed(const K& hashed_key) {
+D digest = static_cast<D>(hashed_key >> 32);
+// Set every byte in VecD_Comp to `digest`.
+return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));
+}
+
+template <typename K>
+__forceinline__ __device__ VecD_Comp empty_digests() {
+D digest = empty_digest<K>();
+// Set every byte in VecD_Comp to `digest`.
+return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));
+}
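
A digest here is one byte taken from the upper half of the 64-bit key hash, and `VecD_Comp` holds four copies of it so that four bucket digests can be screened with one 32-bit SIMD comparison. The sketch below illustrates that screening step; it is not code from this commit, `uint8_t`/`uint32_t` stand in for `D`/`byte4`, and `match_mask` is a hypothetical helper.

```cuda
#include <cstdint>

using D = uint8_t;           // Stand-in for the repo's digest type.
using VecD_Comp = uint32_t;  // Stand-in for byte4: four digests in 32 bits.

// Pack four copies of one digest, mirroring digests_from_hashed above.
__forceinline__ __device__ VecD_Comp pack_digest(D digest) {
  return static_cast<VecD_Comp>(__byte_perm(digest, digest, 0x0000));
}

// Compare the probe digest against four bucket digests at once.
// __vcmpeq4 sets every byte that matches to 0xff; the shifts then compact
// one bit per byte into a 4-bit mask (bit j set <=> slot j matched).
__forceinline__ __device__ uint32_t match_mask(VecD_Comp probe,
                                               VecD_Comp bucket_digests) {
  uint32_t eq = __vcmpeq4(probe, bucket_digests) & 0x01010101u;
  return (eq | (eq >> 7) | (eq >> 14) | (eq >> 21)) & 0xFu;
}

// Demo: screen four consecutive bucket digests (assumed 4-byte aligned).
__global__ void probe_demo(const D* bucket_digests, D key_digest,
                           uint32_t* mask_out) {
  VecD_Comp four = *reinterpret_cast<const VecD_Comp*>(bucket_digests);
  *mask_out = match_mask(pack_digest(key_digest), four);
}
```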

+// Position alignment.
+template <uint32_t ALIGN_SIZE>
+__forceinline__ __device__ uint32_t align_to(uint32_t& pos) {
+constexpr uint32_t MASK = 0xffffffffU - (ALIGN_SIZE - 1);
+return pos & MASK;
+}
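
`align_to` clears the low bits of a position so it becomes a multiple of `ALIGN_SIZE`, which must be a power of two, for example when rounding a slot index down to a vector-aligned base before a vectorized digest load. A quick host-side check of the same bit trick (not code from this commit):

```cuda
#include <cstdint>
#include <cstdio>

template <uint32_t ALIGN_SIZE>
uint32_t align_to(uint32_t pos) {
  // ALIGN_SIZE must be a power of two; the mask clears the low bits.
  constexpr uint32_t MASK = 0xffffffffU - (ALIGN_SIZE - 1);
  return pos & MASK;
}

int main() {
  printf("%u %u\n", align_to<4>(37u), align_to<16>(37u));  // prints "36 32"
  return 0;
}
```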

template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
const ElementType* src);

template <>
-__forceinline__ __device__ void LDGSTS<uint8_t>(uint8_t* dst,
-const uint8_t* src) {
-uint8_t element = *src;
+__forceinline__ __device__ void LDGSTS<byte>(byte* dst, const byte* src) {
+byte element = *src;
*dst = element;
}

template <>
-__forceinline__ __device__ void LDGSTS<uint16_t>(uint16_t* dst,
-const uint16_t* src) {
-uint16_t element = *src;
+__forceinline__ __device__ void LDGSTS<byte2>(byte2* dst, const byte2* src) {
+byte2 element = *src;
*dst = element;
}
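
The `byte` and `byte2` specializations fall back to a plain load and store because `cp.async` only handles 4-, 8-, and 16-byte accesses; the generic overload further below (its body is truncated in this diff) presumably takes the asynchronous path, which its `Require compute ability >= 8.0` comment hints at. A hedged sketch of that pattern only, not the overload itself:

```cuda
#include <cuda_pipeline.h>

// Illustration of a global-to-shared asynchronous copy for element types of
// 4, 8, or 16 bytes (the sizes cp.async supports). `dst` must point into
// shared memory, and the copy only completes after the caller later pairs a
// __pipeline_commit() with a __pipeline_wait_prior(). Needs sm_80 or newer.
template <typename ElementType>
__forceinline__ __device__ void ldgsts_async(ElementType* dst,
                                             const ElementType* src) {
  static_assert(sizeof(ElementType) == 4 || sizeof(ElementType) == 8 ||
                    sizeof(ElementType) == 16,
                "cp.async only supports 4-, 8-, or 16-byte accesses");
  __pipeline_memcpy_async(dst, src, sizeof(ElementType));
}
```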

// Require compute ability >= 8.0
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
const ElementType* src) {
@@ -96,11 +123,11 @@ struct CopyScoreByPassCache {
__forceinline__ __device__ static S lgs(const S* src) { return src[0]; }

__forceinline__ __device__ static void stg(S* dst, const S score_) {
-__stwt(dst, score_);
+__stcs(dst, score_);
}
};

-template <typename VecV = float4, int GROUP_SIZE = 16>
+template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueOneGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
@@ -113,52 +140,89 @@
int offset = rank;
if (offset < dim) {
VecV vec_v = src[offset];
-__stwt(dst + offset, vec_v);
+__stcs(dst + offset, vec_v);
}
}
};

-template <typename VecV = float4, int GROUP_SIZE = 16>
+template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueTwoGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
-const VecV* src, int dim) {
+const VecV* src,
+const int dim) {
int offset = rank;
LDGSTS<VecV>(dst + offset, src + offset);
offset += GROUP_SIZE;
if (offset < dim) LDGSTS<VecV>(dst + offset, src + offset);
}

__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
-const VecV* src, int dim) {
+const VecV* src,
+const int dim) {
int offset = rank;
const VecV vec_v = src[offset];
-__stwt(dst + offset, vec_v);
+__stcs(dst + offset, vec_v);
offset += GROUP_SIZE;
if (offset < dim) {
const VecV vec_v = src[offset];
-__stwt(dst + offset, vec_v);
+__stcs(dst + offset, vec_v);
}
}
};

-template <typename VecV = float4, int GROUP_SIZE = 16>
+template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueMultipleGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
-const VecV* src, int dim) {
+const VecV* src,
+const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
LDGSTS<VecV>(dst + offset, src + offset);
}
}

__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
-const VecV* src, int dim) {
+const VecV* src,
+const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
VecV vec_v = src[offset];
-__stwt(dst + offset, vec_v);
+__stcs(dst + offset, vec_v);
}
}

+__forceinline__ __device__ static void ldg_stg(int rank, VecV* dst,
+const VecV* src,
+const int dim) {
+for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
+VecV vec_v = __ldcs(src + offset);
+__stcs(dst + offset, vec_v);
+}
+}
};
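
The `CopyValue*Group` helpers move one value of `dim` 16-byte chunks with a tile of `GROUP_SIZE` threads, thread `rank` handling chunks `rank`, `rank + GROUP_SIZE`, and so on, either staging through shared memory (`ldg_sts`/`lds_stg`) or streaming global-to-global (`ldg_stg`). Below is a standalone simplification of the global-to-global path, with `uint4` standing in for `byte16`; it is not the structs above verbatim.

```cuda
#include <cooperative_groups.h>
namespace cg = cooperative_groups;

// One GROUP_SIZE-thread tile copies one value of `dim` 16-byte chunks.
// __ldcs/__stcs use the streaming (evict-first) hint: the copied bytes are
// not re-read by this kernel, so they should not displace useful cache lines.
template <int GROUP_SIZE>
__device__ void copy_value(cg::thread_block_tile<GROUP_SIZE> g, uint4* dst,
                           const uint4* src, int dim) {
  for (int offset = g.thread_rank(); offset < dim; offset += GROUP_SIZE) {
    uint4 v = __ldcs(src + offset);
    __stcs(dst + offset, v);
  }
}

// Each 16-thread tile copies one of n values. Launch with blockDim.x a
// multiple of 16.
__global__ void copy_values(uint4* dst, const uint4* src, int dim, int n) {
  cg::thread_block block = cg::this_thread_block();
  cg::thread_block_tile<16> tile = cg::tiled_partition<16>(block);
  int value_idx = (blockIdx.x * blockDim.x + threadIdx.x) / 16;
  if (value_idx >= n) return;
  copy_value<16>(tile, dst + value_idx * dim, src + value_idx * dim, dim);
}
```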

+template <typename K, typename S>
+__forceinline__ __device__ void evict_key_score(K* evicted_keys,
+S* evicted_scores,
+const uint32_t evict_idx,
+const K& key, const S& score) {
+// Cache with evict_first strategy.
+__stcs(evicted_keys + evict_idx, key);
+if (evicted_scores != nullptr) {
+__stcs(evicted_scores + evict_idx, score);
+}
+}
+
+template <typename K, typename V, typename S, typename BUCKET = Bucket<K, V, S>>
+__forceinline__ __device__ void update_score_digest(
+K* bucket_keys_ptr, const uint32_t bucket_capacity, const uint32_t key_pos,
+const K& key, const S& score) {
+S* dst_score_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
+D* dst_digest_ptr =
+BUCKET::digests(bucket_keys_ptr, bucket_capacity, key_pos);
+// Cache in L2 cache, bypass L1 Cache.
+__stcg(dst_digest_ptr, get_digest<K>(key));
+__stcg(dst_score_ptr, score);
+}
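
The two helpers above differ mainly in their cache hints: `__stcs` marks the evicted keys and scores as streaming (evict-first) output, while `__stcg` keeps the refreshed digest and score cached in L2 while bypassing L1, presumably because other blocks may probe them soon. The kernel below contrasts the two hints on a deliberately flattened layout; it uses plain arrays instead of the real `Bucket` accessors and a simplified digest, so it is an illustration, not the commit's code.

```cuda
#include <cstdint>

__global__ void emit_eviction(uint64_t* evicted_keys, uint64_t* evicted_scores,
                              uint8_t* digests, uint64_t* scores,
                              uint32_t evict_idx, uint32_t key_pos,
                              uint64_t key, uint64_t score) {
  // Streaming stores: the eviction buffers are write-once output for the
  // host, so mark them evict-first rather than polluting the caches.
  __stcs(evicted_keys + evict_idx, key);
  __stcs(evicted_scores + evict_idx, score);

  // Global-level stores: keep the slot's metadata resident in L2 (L1 is
  // bypassed) so later probes can hit it. The digest here is a simplified
  // stand-in; the real code derives it from a hash of the key.
  __stcg(digests + key_pos, static_cast<uint8_t>(key >> 56));
  __stcg(scores + key_pos, score);
}
```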

template <class K, class V, class S>
__forceinline__ __device__ void update_score(Bucket<K, V, S>* __restrict bucket,
const int key_pos,
12 changes: 8 additions & 4 deletions include/merlin/core_kernels/lookup.cuh
@@ -108,8 +108,10 @@ __global__ void lookup_kernel_with_io_pipeline_v1(
__pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
digests_ptr + rank * 4, sizeof(uint32_t));
__pipeline_commit();
-__pipeline_commit(); // padding
-__pipeline_commit(); // padding
+// Padding, meet the param of the first `__pipeline_wait_prior`
+// in the first loop.
+__pipeline_commit();
+__pipeline_commit();

for (int i = 0; i < loop_num; i++) {
int key_idx_block = groupID * GROUP_SIZE + i;
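
The two `__pipeline_commit()` padding calls exist only so that the number of commits already issued when the loop reaches its first `__pipeline_wait_prior` matches the steady-state count that the wait assumes. The kernel below is a reduced, self-contained sketch of that accounting, using a prefetch distance of 2 and a single padding commit, whereas the kernels in this file batch more copies and pad with two commits.

```cuda
#include <cuda_pipeline.h>

// The loop always waits with __pipeline_wait_prior(2), i.e. it assumes the
// copy it needs is at least two commits old. The prologue batches tiles 0
// and 1 under one commit, so one empty "padding" commit is added to make
// that assumption already hold on the first iteration. Launch with one
// block of BLOCK threads; num_tiles >= 2.
template <int BLOCK>
__global__ void padded_pipeline(const int* in, int* out, int num_tiles) {
  __shared__ int buf[3][BLOCK];  // Consume tile i, i+1 landed, i+2 in flight.
  const int tid = threadIdx.x;
  int acc = 0;

  __pipeline_memcpy_async(&buf[0][tid], &in[0 * BLOCK + tid], sizeof(int));
  __pipeline_memcpy_async(&buf[1][tid], &in[1 * BLOCK + tid], sizeof(int));
  __pipeline_commit();
  __pipeline_commit();  // padding

  for (int i = 0; i < num_tiles; ++i) {
    if (i + 2 < num_tiles) {
      // Prefetch the tile needed two iterations from now.
      __pipeline_memcpy_async(&buf[(i + 2) % 3][tid],
                              &in[(i + 2) * BLOCK + tid], sizeof(int));
    }
    __pipeline_commit();  // Empty near the end; keeps the commit count steady.
    // All but the two newest commits have completed, so tile i has landed.
    __pipeline_wait_prior(2);
    acc += buf[i % 3][tid];
  }
  out[tid] = acc;
}
```
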
@@ -369,8 +371,10 @@ __global__ void lookup_kernel_with_io_pipeline_v2(
sm_probing_digests[0] + groupID * DIGEST_SPAN + rank * 2,
digests_ptr + rank * 8, sizeof(uint2));
__pipeline_commit();
-__pipeline_commit(); // padding
-__pipeline_commit(); // padding
+// Padding, meet the param of the first `__pipeline_wait_prior`
+// in the first loop.
+__pipeline_commit();
+__pipeline_commit();

for (int i = 0; i < loop_num; i++) {
int key_idx_block = groupID * GROUP_SIZE + i;