This repository has been archived by the owner on Feb 23, 2024. It is now read-only.

Fused ops #1

Open · wants to merge 4 commits into base: master
2 changes: 1 addition & 1 deletion build/googletest-src
Submodule googletest-src updated from df7fee to f5e592
7 changes: 7 additions & 0 deletions src/global.cuh
@@ -30,6 +30,13 @@ enum class SourceT {
HOST,
};

enum class OperationT {
INSERT,
QUERY,
DELETE,
NOP,
};

#define WARP_MASK 0xFFFFFFFF
#define NODE_WIDTH 32
#define WARP_WIDTH 32
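
The new OperationT enum is what allows a single batch to mix point operations. As a reference, here is a minimal host-side sketch of assembling such a batch; the array names, types, and sizes are illustrative, not part of this PR:

#include <cstdint>
#include <vector>

// One OperationT entry per key/value slot; NOP pads the batch to a uniform size.
std::vector<uint32_t> keys   = {5, 17, 42, 99};
std::vector<uint32_t> values = {50, 0, 420, 0};  // QUERY slots receive their results in place
std::vector<OperationT> ops  = {OperationT::INSERT, OperationT::QUERY,
                                OperationT::INSERT, OperationT::NOP};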
200 changes: 200 additions & 0 deletions src/map/GpuBTree.cuh
@@ -73,6 +73,32 @@ class GpuBTreeMap {
SizeT& count,
SizeT& range_length,
cudaStream_t stream_id = 0);
cudaError_t concurrentOpsWithRangeQueries(uint32_t*& d_root,
KeyT*& d_keys,
ValueT*& d_values,
OperationT*& d_ops,
SizeT& num_keys,
KeyT*& range_queries_lower,
KeyT*& range_queries_upper,
ValueT*& d_range_results,
SizeT& num_range_queries,
SizeT& range_length,
KeyT*& delete_queries,
SizeT& num_delete_queries,
cudaStream_t stream_id = 0);
cudaError_t serialExecutionAllOps(uint32_t*& d_root,
KeyT*& d_keys,
ValueT*& d_values,
OperationT*& d_ops,
SizeT& num_keys,
KeyT*& range_queries_lower,
KeyT*& range_queries_upper,
ValueT*& d_range_results,
SizeT& num_range_queries,
SizeT& range_length,
KeyT*& delete_queries,
SizeT& num_delete_queries,
cudaStream_t stream_id = 0);
bool _handle_memory;

public:
@@ -238,5 +264,179 @@ class GpuBTreeMap {

return cudaSuccess;
}

cudaError_t concurrentOpsWithRangeQueries(
KeyT* keys,
ValueT* values,
OperationT* ops,
SizeT num_keys,
KeyT* range_queries_lower,
KeyT* range_queries_upper,
ValueT* range_results,
SizeT num_range_queries,
SizeT average_length,
KeyT* delete_queries,
SizeT num_delete_queries,
SourceT source = SourceT::DEVICE) {
KeyT* d_keys;
ValueT* d_values;
OperationT* d_ops;

KeyT* d_range_queries_lower;
KeyT* d_range_queries_upper;
ValueT* d_range_results;
auto total_range_length = num_range_queries * average_length * 2;

KeyT* d_delete_queries;
if (source == SourceT::HOST) {
// Search and insert
CHECK_ERROR(memoryUtil::deviceAlloc(d_keys, num_keys));
CHECK_ERROR(memoryUtil::deviceAlloc(d_values, num_keys));
CHECK_ERROR(memoryUtil::deviceAlloc(d_ops, num_keys));
CHECK_ERROR(memoryUtil::cpyToDevice(keys, d_keys, num_keys));
CHECK_ERROR(memoryUtil::cpyToDevice(values, d_values, num_keys));
CHECK_ERROR(memoryUtil::cpyToDevice(ops, d_ops, num_keys));

// Range queries
CHECK_ERROR(memoryUtil::deviceAlloc(d_range_queries_lower, num_range_queries));
CHECK_ERROR(memoryUtil::deviceAlloc(d_range_queries_upper, num_range_queries));
CHECK_ERROR(memoryUtil::deviceAlloc(d_range_results, total_range_length));
CHECK_ERROR(memoryUtil::cpyToDevice(range_queries_lower, d_range_queries_lower, num_range_queries));
CHECK_ERROR(memoryUtil::cpyToDevice(range_queries_upper, d_range_queries_upper, num_range_queries));

// Delete
CHECK_ERROR(memoryUtil::deviceAlloc(d_delete_queries, num_delete_queries));
CHECK_ERROR(memoryUtil::cpyToDevice(delete_queries, d_delete_queries, num_delete_queries));
} else {
d_keys = keys;
d_values = values;
d_ops = ops;

d_range_queries_lower = range_queries_lower;
d_range_queries_upper = range_queries_upper;
d_range_results = range_results;

d_delete_queries = delete_queries;
}

CHECK_ERROR(concurrentOpsWithRangeQueries(
_d_root,
d_keys,
d_values,
d_ops,
num_keys,
d_range_queries_lower,
d_range_queries_upper,
d_range_results,
num_range_queries,
average_length,
d_delete_queries,
num_delete_queries));

if (source == SourceT::HOST) {
CHECK_ERROR(memoryUtil::cpyToHost(d_values, values, num_keys));
CHECK_ERROR(memoryUtil::deviceFree(d_keys));
CHECK_ERROR(memoryUtil::deviceFree(d_values));
CHECK_ERROR(memoryUtil::deviceFree(d_ops));

CHECK_ERROR(memoryUtil::cpyToHost(d_range_results, range_results, total_range_length));
CHECK_ERROR(memoryUtil::deviceFree(d_range_results));
CHECK_ERROR(memoryUtil::deviceFree(d_range_queries_lower));
CHECK_ERROR(memoryUtil::deviceFree(d_range_queries_upper));

CHECK_ERROR(memoryUtil::deviceFree(d_delete_queries));
}

return cudaSuccess;
}

cudaError_t serialExecutionAllOps(
KeyT* keys,
ValueT* values,
OperationT* ops,
SizeT num_keys,
KeyT* range_queries_lower,
KeyT* range_queries_upper,
ValueT* range_results,
SizeT num_range_queries,
SizeT average_length,
KeyT* delete_queries,
SizeT num_delete_queries,
SourceT source = SourceT::DEVICE) {

KeyT* d_keys;
ValueT* d_values;
OperationT* d_ops;

KeyT* d_range_queries_lower;
KeyT* d_range_queries_upper;
ValueT* d_range_results;
auto total_range_length = num_range_queries * average_length * 2;

KeyT* d_delete_queries;
if (source == SourceT::HOST) {
// Search and insert
CHECK_ERROR(memoryUtil::deviceAlloc(d_keys, num_keys));
CHECK_ERROR(memoryUtil::deviceAlloc(d_values, num_keys));
CHECK_ERROR(memoryUtil::deviceAlloc(d_ops, num_keys));
CHECK_ERROR(memoryUtil::cpyToDevice(keys, d_keys, num_keys));
CHECK_ERROR(memoryUtil::cpyToDevice(values, d_values, num_keys));
CHECK_ERROR(memoryUtil::cpyToDevice(ops, d_ops, num_keys));

// Range queries
CHECK_ERROR(memoryUtil::deviceAlloc(d_range_queries_lower, num_range_queries));
CHECK_ERROR(memoryUtil::deviceAlloc(d_range_queries_upper, num_range_queries));
CHECK_ERROR(memoryUtil::deviceAlloc(d_range_results, total_range_length));
CHECK_ERROR(memoryUtil::cpyToDevice(range_queries_lower, d_range_queries_lower, num_range_queries));
CHECK_ERROR(memoryUtil::cpyToDevice(range_queries_upper, d_range_queries_upper, num_range_queries));

// Delete
CHECK_ERROR(memoryUtil::deviceAlloc(d_delete_queries, num_delete_queries));
CHECK_ERROR(memoryUtil::cpyToDevice(delete_queries, d_delete_queries, num_delete_queries));
} else {
d_keys = keys;
d_values = values;
d_ops = ops;

d_range_queries_lower = range_queries_lower;
d_range_queries_upper = range_queries_upper;
d_range_results = range_results;

d_delete_queries = delete_queries;
}

CHECK_ERROR(serialExecutionAllOps(
_d_root,
d_keys,
d_values,
d_ops,
num_keys,
d_range_queries_lower,
d_range_queries_upper,
d_range_results,
num_range_queries,
average_length,
d_delete_queries,
num_delete_queries));

if (source == SourceT::HOST) {
CHECK_ERROR(memoryUtil::cpyToHost(d_values, values, num_keys));
CHECK_ERROR(memoryUtil::deviceFree(d_keys));
CHECK_ERROR(memoryUtil::deviceFree(d_values));
CHECK_ERROR(memoryUtil::deviceFree(d_ops));

CHECK_ERROR(memoryUtil::cpyToHost(d_range_results, range_results, total_range_length));
CHECK_ERROR(memoryUtil::deviceFree(d_range_results));
CHECK_ERROR(memoryUtil::deviceFree(d_range_queries_lower));
CHECK_ERROR(memoryUtil::deviceFree(d_range_queries_upper));

CHECK_ERROR(memoryUtil::deviceFree(d_delete_queries));
}

return cudaSuccess;
}
};
}; // namespace GpuBTree
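
With the host-side wrappers above, the new entry point can be driven entirely from host memory. A minimal usage sketch, continuing the batch from the global.cuh section; the map instantiation and all counts are illustrative, and template defaults for SizeT/AllocatorT are assumed:

GpuBTree::GpuBTreeMap<uint32_t, uint32_t> tree;  // assumes default SizeT/AllocatorT

std::vector<uint32_t> range_lower = {10, 40};
std::vector<uint32_t> range_upper = {20, 60};
uint32_t avg_len = 8;  // expected results per range query
std::vector<uint32_t> range_results(range_lower.size() * avg_len * 2);  // room for key/value pairs
std::vector<uint32_t> deletes = {17};

tree.concurrentOpsWithRangeQueries(keys.data(), values.data(), ops.data(),
                                   static_cast<uint32_t>(keys.size()),
                                   range_lower.data(), range_upper.data(),
                                   range_results.data(),
                                   static_cast<uint32_t>(range_lower.size()),
                                   avg_len,
                                   deletes.data(),
                                   static_cast<uint32_t>(deletes.size()),
                                   SourceT::HOST);
// With SourceT::HOST, results come back through values and range_results.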
80 changes: 80 additions & 0 deletions src/map/host.cu
@@ -118,4 +118,84 @@ cudaError_t GpuBTreeMap<KeyT, ValueT, SizeT, AllocatorT>::rangeQuery(

return cudaSuccess;
}

template<typename KeyT, typename ValueT, typename SizeT, typename AllocatorT>
cudaError_t GpuBTreeMap<KeyT, ValueT, SizeT, AllocatorT>::concurrentOpsWithRangeQueries(
uint32_t*& d_root,
KeyT*& d_keys,
ValueT*& d_values,
OperationT*& d_ops,
SizeT& num_keys,
KeyT*& range_queries_lower,
KeyT*& range_queries_upper,
ValueT*& d_range_results,
SizeT& num_range_queries,
SizeT& range_length,
KeyT*& delete_queries,
SizeT& num_delete_queries,
cudaStream_t stream_id) {
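// Launch configuration: the grid is sized by the point-operation count
// (num_keys); the fused kernel is assumed to fold the range-query and
// delete inputs into the same launch.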
const uint32_t block_size = 512;
const uint32_t num_blocks = (num_keys + block_size - 1) / block_size;
const uint32_t shared_bytes = 0;
kernels::fused_ops_b_tree<<<num_blocks, block_size, shared_bytes, stream_id>>>(
d_root,
d_keys,
d_values,
d_ops,
num_keys,
range_queries_lower,
range_queries_upper,
d_range_results,
num_range_queries,
range_length,
delete_queries,
num_delete_queries,
_mem_allocator);

return cudaSuccess;
}
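
The body of kernels::fused_ops_b_tree is not part of this diff. Purely as a sketch of the fused-dispatch idea, and noting that the tree operations in this codebase are warp-cooperative (WARP_WIDTH is 32), a simplified per-slot dispatch could look like the following; every name other than OperationT is hypothetical:

// Hypothetical sketch only -- not the kernel added by this PR.
template<typename KeyT, typename ValueT, typename SizeT>
__global__ void fused_point_ops_sketch(uint32_t* d_root,
                                       KeyT* d_keys,
                                       ValueT* d_values,
                                       OperationT* d_ops,
                                       SizeT num_keys) {
  SizeT tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid >= num_keys)
    return;
  switch (d_ops[tid]) {
    case OperationT::INSERT:
      // insert d_keys[tid] -> d_values[tid] into the tree rooted at d_root
      break;
    case OperationT::QUERY:
      // look up d_keys[tid]; write the result back to d_values[tid]
      break;
    case OperationT::DELETE:
      // remove d_keys[tid] from the tree
      break;
    case OperationT::NOP:
    default:
      break;  // padding slot: no work
  }
}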

template<typename KeyT, typename ValueT, typename SizeT, typename AllocatorT>
cudaError_t GpuBTreeMap<KeyT, ValueT, SizeT, AllocatorT>::serialExecutionAllOps(
uint32_t*& d_root,
KeyT*& d_keys,
ValueT*& d_values,
OperationT*& d_ops,
SizeT& num_keys,
KeyT*& d_queries_lower,
KeyT*& d_queries_upper,
ValueT*& d_range_results,
SizeT& num_range_queries,
SizeT& range_length,
KeyT*& delete_queries,
SizeT& num_delete_queries,
cudaStream_t stream_id) {
const uint32_t block_size = 512;
const uint32_t shared_bytes = 0;

// Size each grid by its own input count rather than reusing the
// point-operation grid for the range and delete kernels.
const uint32_t num_blocks = (num_keys + block_size - 1) / block_size;
kernels::concurrent_ops_b_tree<<<num_blocks, block_size, shared_bytes, stream_id>>>(
d_root,
d_keys,
d_values,
d_ops,
num_keys,
_mem_allocator);

const uint32_t num_range_blocks = (num_range_queries + block_size - 1) / block_size;
kernels::range_b_tree<<<num_range_blocks, block_size, shared_bytes, stream_id>>>(
d_root,
d_queries_lower,
d_queries_upper,
d_range_results,
num_range_queries,
range_length,
_mem_allocator);

const uint32_t num_delete_blocks =
(num_delete_queries + BLOCKSIZE_SEARCH_ - 1) / BLOCKSIZE_SEARCH_;
kernels::delete_b_tree<<<num_delete_blocks, BLOCKSIZE_SEARCH_, shared_bytes, stream_id>>>(
d_root, delete_queries, num_delete_queries, _mem_allocator);

// The three kernels share stream_id, so they execute in submission order;
// the synchronize only makes the host wait for completion.
CHECK_ERROR(cudaStreamSynchronize(stream_id));

return cudaSuccess;
}
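
Design note: serialExecutionAllOps issues the same workload as three back-to-back launches on one stream, so the point operations, range queries, and deletes execute in a fixed order; this reads as the natural baseline against the single fused_ops_b_tree launch above.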

}; // namespace GpuBTree