Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[Opt] Optimize insert_and_evict API using pipeline and TLP. #157

Merged
merged 1 commit into from
Aug 7, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
64 changes: 32 additions & 32 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,79 +128,79 @@ Your environment must meet the following requirements:
* `find_or_insert*` means the `find_or_insert` API that directly returns the addresses of values.
* ***Throughput Unit: Billion-KV/second***

### On pure HBM mode:

* dim = 8, capacity = 128 Million-KV, HBM = 4 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 | 1.161 | 2.813 | 1.708 | 1.942 | 4.147 | 1.801 | 1.023 |
| 0.75 | 1.004 | 2.785 | 0.669 | 0.865 | 1.939 | 1.302 | 0.873 |
| 1.00 | 0.364 | 2.801 | 0.370 | 0.499 | 0.930 | 0.392 | 0.315 |
| 0.50 | 1.151 | 2.535 | 1.691 | 1.927 | 4.127 | 1.799 | 1.071 |
| 0.75 | 0.999 | 2.538 | 0.668 | 0.864 | 1.927 | 1.293 | 1.026 |
| 1.00 | 0.364 | 2.559 | 0.370 | 0.498 | 0.929 | 0.392 | 0.505 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 2.213 | 19.096 |
| 0.75 | 2.165 | 19.825 |
| 1.00 | 2.067 | 2.818 |
| 0.50 | 2.186 | 18.059 |
| 0.75 | 2.163 | 16.762 |
| 1.00 | 2.059 | 2.758 |

* dim = 32, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 | 1.071 | 2.490 | 1.264 | 1.602 | 4.136 | 1.801 | 0.949 |
| 0.75 | 0.866 | 2.462 | 0.637 | 0.853 | 1.939 | 1.302 | 0.740 |
| 1.00 | 0.359 | 2.573 | 0.348 | 0.492 | 0.925 | 0.377 | 0.278 |
| 0.50 | 1.073 | 2.341 | 1.265 | 1.594 | 4.121 | 1.795 | 0.931 |
| 0.75 | 0.862 | 2.295 | 0.639 | 0.852 | 1.925 | 1.292 | 0.874 |
| 1.00 | 0.359 | 2.345 | 0.345 | 0.492 | 0.926 | 0.375 | 0.467 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.709 | 15.918 |
| 0.75 | 0.575 | 14.923 |
| 1.00 | 0.567 | 0.758 |
| 0.50 | 0.698 | 14.380 |
| 0.75 | 0.574 | 13.489 |
| 1.00 | 0.563 | 0.761 |

* dim = 64, capacity = 64 Million-KV, HBM = 16 GB, HMEM = 0 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* | insert_and_evict |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|-----------------:|
| 0.50 | 0.868 | 2.122 | 0.916 | 1.116 | 4.451 | 1.838 | 0.773 |
| 0.75 | 0.670 | 2.112 | 0.570 | 0.790 | 1.984 | 1.289 | 0.587 |
| 1.00 | 0.333 | 2.162 | 0.335 | 0.467 | 0.940 | 0.393 | 0.240 |
| 0.50 | 0.864 | 2.040 | 0.921 | 1.112 | 4.395 | 1.825 | 0.806 |
| 0.75 | 0.668 | 2.011 | 0.571 | 0.789 | 1.974 | 1.295 | 0.764 |
| 1.00 | 0.333 | 2.050 | 0.334 | 0.469 | 0.938 | 0.392 | 0.481 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.321 | 11.050 |
| 0.75 | 0.301 | 10.965 |
| 1.00 | 0.292 | 0.390 |
| 0.50 | 0.319 | 10.553 |
| 0.75 | 0.298 | 10.400 |
| 1.00 | 0.293 | 0.390 |

### On HBM+HMEM hybrid mode:

* dim = 64, capacity = 128 Million-KV, HBM = 16 GB, HMEM = 16 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
| 0.50 | 0.118 | 0.145 | 0.119 | 0.147 | 4.097 | 1.807 |
| 0.75 | 0.116 | 0.144 | 0.115 | 0.142 | 1.932 | 1.300 |
| 1.00 | 0.091 | 0.126 | 0.092 | 0.114 | 0.927 | 0.379 |
| 0.50 | 0.083 | 0.124 | 0.116 | 0.132 | 4.032 | 1.791 |
| 0.75 | 0.082 | 0.123 | 0.114 | 0.129 | 1.906 | 1.131 |
| 1.00 | 0.069 | 0.110 | 0.087 | 0.105 | 0.926 | 0.392 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.322 | 11.206 |
| 0.75 | 0.300 | 11.072 |
| 1.00 | 0.292 | 0.388 |
| 0.50 | 0.319 | 10.789 |
| 0.75 | 0.299 | 10.222 |
| 1.00 | 0.294 | 0.389 |

* dim = 64, capacity = 512 Million-KV, HBM = 32 GB, HMEM = 96 GB

| λ | insert_or_assign | find | find_or_insert | assign | find* | find_or_insert* |
|-----:|-----------------:|-------:|---------------:|-------:|-------:|----------------:|
| 0.50 | 0.049 | 0.072 | 0.047 | 0.068 | 3.559 | 1.718 |
| 0.75 | 0.048 | 0.072 | 0.048 | 0.069 | 1.854 | 1.251 |
| 1.00 | 0.044 | 0.067 | 0.044 | 0.061 | 0.912 | 0.360 |
| 0.50 | 0.049 | 0.073 | 0.049 | 0.070 | 3.535 | 1.718 |
| 0.75 | 0.049 | 0.072 | 0.048 | 0.069 | 1.850 | 1.247 |
| 1.00 | 0.044 | 0.068 | 0.044 | 0.061 | 0.911 | 0.390 |

| λ | export_batch | export_batch_if |
|-----:|-------------:|----------------:|
| 0.50 | 0.319 | 10.175 |
| 0.75 | 0.298 | 11.606 |
| 1.00 | 0.292 | 0.388 |
| 0.50 | 0.318 | 10.987 |
| 0.75 | 0.298 | 11.213 |
| 1.00 | 0.294 | 0.388 |

### Support and Feedback:

Expand Down
112 changes: 88 additions & 24 deletions include/merlin/core_kernels/kernel_utils.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,41 +32,68 @@ namespace cg = cooperative_groups;
namespace nv {
namespace merlin {

// Vector type used when loading digests from memory.
using VecD_Load = byte16;
// Vector type used when computing over digests.
using VecD_Comp = byte4;

// Double-buffer selection helpers.
// same_buf: even i selects buffer 0, odd i selects buffer 1.
__forceinline__ __device__ int same_buf(int i) { return i & 0x01; }
// diff_buf: even i selects buffer 1, odd i selects buffer 0.
__forceinline__ __device__ int diff_buf(int i) { return (i + 1) & 0x01; }

// Digest of the reserved EMPTY_KEY sentinel: hash the sentinel and keep
// bits [32, 32 + 8*sizeof(D)).
// NOTE(review): the scraped diff showed both the old (uint8_t) and new (D)
// lines interleaved; this is the reconstructed post-change version.
template <typename K>
__forceinline__ __device__ D empty_digest() {
  const K hashed_key = Murmur3HashDevice(static_cast<K>(EMPTY_KEY));
  return static_cast<D>(hashed_key >> 32);
}

// Digest of `key`: hash it and keep bits [32, 32 + 8*sizeof(D)).
// NOTE(review): reconstructed post-change version of a diff hunk that
// contained both the old (uint8_t) and new (D) lines.
template <typename K>
__forceinline__ __device__ D get_digest(const K& key) {
  const K hashed_key = Murmur3HashDevice(key);
  return static_cast<D>(hashed_key >> 32);
}

// Build a comparison vector whose four bytes all equal the digest derived
// from an already-hashed key.
template <typename K>
__forceinline__ __device__ VecD_Comp digests_from_hashed(const K& hashed_key) {
  const D d = static_cast<D>(hashed_key >> 32);
  // Selector 0x0000 broadcasts byte 0 of `d` into every byte lane.
  return static_cast<VecD_Comp>(__byte_perm(d, d, 0x0000));
}

// Comparison vector whose four bytes all equal the EMPTY_KEY digest.
template <typename K>
__forceinline__ __device__ VecD_Comp empty_digests() {
  const D d = empty_digest<K>();
  // Selector 0x0000 broadcasts byte 0 of `d` into every byte lane.
  return static_cast<VecD_Comp>(__byte_perm(d, d, 0x0000));
}

// Align `pos` downwards to a multiple of ALIGN_SIZE.
// Returns the aligned value; `pos` itself is not modified.
template <uint32_t ALIGN_SIZE>
__forceinline__ __device__ uint32_t align_to(uint32_t& pos) {
  // The mask trick is only valid for powers of two; enforce at compile time.
  static_assert(ALIGN_SIZE != 0 && (ALIGN_SIZE & (ALIGN_SIZE - 1)) == 0,
                "ALIGN_SIZE must be a nonzero power of two");
  // ~(ALIGN_SIZE - 1) == 0xffffffffU - (ALIGN_SIZE - 1): clears the low bits.
  constexpr uint32_t MASK = ~(ALIGN_SIZE - 1);
  return pos & MASK;
}

// Copy one element from global to shared memory ("LDG" + "STS").
// Specializations below handle 1- and 2-byte element types with plain
// load/store; a generic overload further down is annotated as requiring
// compute capability >= 8.0. NOTE(review): completion/async semantics of
// the generic overload are not visible here — confirm callers pair it
// with the appropriate pipeline wait.
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
const ElementType* src);

// Specialization for 1-byte elements: synchronous load + store.
// NOTE(review): reconstructed post-change version of a diff hunk that
// contained both the old (uint8_t) and new (byte) lines.
template <>
__forceinline__ __device__ void LDGSTS<byte>(byte* dst, const byte* src) {
  byte element = *src;
  *dst = element;
}

// Specialization for 2-byte elements: synchronous load + store.
// NOTE(review): reconstructed post-change version of a diff hunk that
// contained both the old (uint16_t) and new (byte2) lines.
template <>
__forceinline__ __device__ void LDGSTS<byte2>(byte2* dst, const byte2* src) {
  byte2 element = *src;
  *dst = element;
}

// Require compute ability >= 8.0
template <typename ElementType>
__forceinline__ __device__ void LDGSTS(ElementType* dst,
const ElementType* src) {
Expand Down Expand Up @@ -96,11 +123,11 @@ struct CopyScoreByPassCache {
// Visible tail of struct CopyScoreByPassCache (header is outside this view).
// NOTE(review): `stg` is reconstructed; the diff hunk showed both the old
// `__stwt` and new `__stcs` store lines.
__forceinline__ __device__ static S lgs(const S* src) { return src[0]; }

__forceinline__ __device__ static void stg(S* dst, const S score_) {
  // Streaming store (evict-first cache hint).
  __stcs(dst, score_);
}
};

template <typename VecV = float4, int GROUP_SIZE = 16>
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueOneGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
Expand All @@ -113,52 +140,89 @@ struct CopyValueOneGroup {
int offset = rank;
if (offset < dim) {
VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
}
}
};

template <typename VecV = float4, int GROUP_SIZE = 16>
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueTwoGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
int offset = rank;
LDGSTS<VecV>(dst + offset, src + offset);
offset += GROUP_SIZE;
if (offset < dim) LDGSTS<VecV>(dst + offset, src + offset);
}

__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
int offset = rank;
const VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
offset += GROUP_SIZE;
if (offset < dim) {
const VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
}
}
};

template <typename VecV = float4, int GROUP_SIZE = 16>
template <typename VecV = byte16, int GROUP_SIZE = 16>
struct CopyValueMultipleGroup {
__forceinline__ __device__ static void ldg_sts(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
LDGSTS<VecV>(dst + offset, src + offset);
}
}

__forceinline__ __device__ static void lds_stg(int rank, VecV* dst,
const VecV* src, int dim) {
const VecV* src,
const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
VecV vec_v = src[offset];
__stwt(dst + offset, vec_v);
__stcs(dst + offset, vec_v);
}
}

__forceinline__ __device__ static void ldg_stg(int rank, VecV* dst,
const VecV* src,
const int dim) {
for (int offset = rank; offset < dim; offset += GROUP_SIZE) {
VecV vec_v = __ldcs(src + offset);
__stcs(dst + offset, vec_v);
}
}
};

// Write an evicted (key, score) pair to the output arrays at `evict_idx`.
// `evicted_scores` may be nullptr, in which case only the key is written.
template <typename K, typename S>
__forceinline__ __device__ void evict_key_score(K* evicted_keys,
                                                S* evicted_scores,
                                                const uint32_t evict_idx,
                                                const K& key, const S& score) {
  // Streaming stores (evict_first cache strategy).
  __stcs(evicted_keys + evict_idx, key);
  if (evicted_scores == nullptr) return;
  __stcs(evicted_scores + evict_idx, score);
}

// Refresh the score and digest stored in a bucket slot `key_pos` for `key`.
template <typename K, typename V, typename S, typename BUCKET = Bucket<K, V, S>>
__forceinline__ __device__ void update_score_digest(
    K* bucket_keys_ptr, const uint32_t bucket_capacity, const uint32_t key_pos,
    const K& key, const S& score) {
  D* digest_ptr = BUCKET::digests(bucket_keys_ptr, bucket_capacity, key_pos);
  S* score_ptr = BUCKET::scores(bucket_keys_ptr, bucket_capacity, key_pos);
  // __stcg: cache in L2, bypass L1.
  __stcg(digest_ptr, get_digest<K>(key));
  __stcg(score_ptr, score);
}

template <class K, class V, class S>
__forceinline__ __device__ void update_score(Bucket<K, V, S>* __restrict bucket,
const int key_pos,
Expand Down
12 changes: 8 additions & 4 deletions include/merlin/core_kernels/lookup.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -108,8 +108,10 @@ __global__ void lookup_kernel_with_io_pipeline_v1(
__pipeline_memcpy_async(sm_probing_digests[0] + groupID * DIGEST_SPAN + rank,
digests_ptr + rank * 4, sizeof(uint32_t));
__pipeline_commit();
__pipeline_commit(); // padding
__pipeline_commit(); // padding
// Two padding commits so that the first `__pipeline_wait_prior` call in
// the loop below observes the expected number of in-flight stages.
__pipeline_commit();
__pipeline_commit();

for (int i = 0; i < loop_num; i++) {
int key_idx_block = groupID * GROUP_SIZE + i;
Expand Down Expand Up @@ -369,8 +371,10 @@ __global__ void lookup_kernel_with_io_pipeline_v2(
sm_probing_digests[0] + groupID * DIGEST_SPAN + rank * 2,
digests_ptr + rank * 8, sizeof(uint2));
__pipeline_commit();
__pipeline_commit(); // padding
__pipeline_commit(); // padding
// Two padding commits so that the first `__pipeline_wait_prior` call in
// the loop below observes the expected number of in-flight stages.
__pipeline_commit();
__pipeline_commit();

for (int i = 0; i < loop_num; i++) {
int key_idx_block = groupID * GROUP_SIZE + i;
Expand Down
Loading