From abd56d1833a2382fc896d0c94a718b7df38d9d0b Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 20:12:57 -0400 Subject: [PATCH 01/12] Revert "varargs fixes" This reverts commit 85adef7fe2fbe2b415a7a04398dabbfe1579f01e. --- README.md | 2 +- examples/vector_sum_cpu.nim | 4 +- examples/vector_sum_cuda.nim | 4 +- examples/vector_sum_hip_amd.nim | 4 +- examples/vector_sum_hip_nvidia.nim | 4 +- src/cuda.nim | 3 + src/hip.nim | 20 +----- src/hippo.nim | 105 ++++++++++++++++++----------- tests/hip/call_params.nim | 4 +- tests/hip/dot.nim | 4 +- tests/hip/vector_sum.nim | 4 +- 11 files changed, 86 insertions(+), 72 deletions(-) diff --git a/README.md b/README.md index b58c32a..f791e0a 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ proc addKernel*(a, b: cint; c: ptr[cint]) {.hippoGlobal.} = var c: int32 var dev_c: ptr[int32] handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint)) -hippoLaunchKernel(addKernel,args = (2,7,dev_c)) +handleError(launchKernel(addKernel,args = (2,7,dev_c))) handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost)) echo "2 + 7 = ", c handleError(hipFree(dev_c)) diff --git a/examples/vector_sum_cpu.nim b/examples/vector_sum_cpu.nim index 4d940c0..2b9cb24 100644 --- a/examples/vector_sum_cpu.nim +++ b/examples/vector_sum_cpu.nim @@ -34,11 +34,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - hippoLaunchKernel( + handleError(launchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - ) + )) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) diff --git a/examples/vector_sum_cuda.nim b/examples/vector_sum_cuda.nim index 01f2c45..5f0fa21 100644 --- a/examples/vector_sum_cuda.nim +++ b/examples/vector_sum_cuda.nim @@ -35,11 +35,11 @@ proc main() = handleError(cudaMemcpy(dev_b, addr b[0], sizeof(int32)*N, cudaMemcpyHostToDevice)) # launch kernel - hippoLaunchKernel( + handleError(launchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - ) + )) # copy result back to host handleError(cudaMemcpy(addr c[0], dev_c, sizeof(int32)*N, cudaMemcpyDeviceToHost)) diff --git a/examples/vector_sum_hip_amd.nim b/examples/vector_sum_hip_amd.nim index 8bdb762..3aba581 100644 --- a/examples/vector_sum_hip_amd.nim +++ b/examples/vector_sum_hip_amd.nim @@ -34,11 +34,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - hippoLaunchKernel( + handleError(launchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - ) + )) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) diff --git a/examples/vector_sum_hip_nvidia.nim b/examples/vector_sum_hip_nvidia.nim index 165119b..00102f0 100644 --- a/examples/vector_sum_hip_nvidia.nim +++ b/examples/vector_sum_hip_nvidia.nim @@ -37,11 +37,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - hippoLaunchKernel( + handleError(launchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - ) + )) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) diff --git a/src/cuda.nim b/src/cuda.nim index f4601ed..296f0ea 100644 --- a/src/cuda.nim +++ b/src/cuda.nim @@ -55,6 +55,9 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer, sharedMemBytes: uint32_t, stream: cudaStream_t): cudaError_t {. importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".} +# proc cudaLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3; +# args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {. +# importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".} proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".} proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".} proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".} diff --git a/src/hip.nim b/src/hip.nim index ac15624..942a412 100644 --- a/src/hip.nim +++ b/src/hip.nim @@ -52,9 +52,9 @@ proc hipFree*(`ptr`: pointer): hipError_t {.header: "hip/hip_runtime.h",importcp proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer): hipError_t {. importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".} -proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; - args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {. - importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".} +# proc hipLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3; +# args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {. +# importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".} proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".} proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".} proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".} @@ -68,20 +68,6 @@ proc hipLaunchKernelGGL*( ) {. importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.} -proc hipModuleLaunchKernel*( - function_address: pointer; - grid_dim_x: uint32_t; - grid_dim_y: uint32_t; - grid_dim_z: uint32_t; - block_dim_x: uint32_t; - block_dim_y: uint32_t; - block_dim_z: uint32_t; - sharedMemBytes: uint32_t; - stream: hipStream_t; - kernel_params: ptr pointer; - extra: ptr pointer; - ): hipError_t {.importcpp: "hipModuleLaunchKernel(@)", header: "hip/hip_runtime.h".} - type ConstCString* {.importc: "const char*".} = object converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.} diff --git a/src/hippo.nim b/src/hippo.nim index 4339d85..280d0e7 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -134,63 +134,88 @@ proc `=destroy`*(mem: var GpuMemory) = # ------------------- # Kernel Execution -template hippoLaunchKernel*( - kernel: proc, ## The GPU kernel procedure to launch - gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block - blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block - sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate - stream: HippoStream = nil, ## Which device stream to run under (defaults to null) - args: tuple, ## Arguments to pass to the GPU kernel -) = - var result: HippoError - ## Launch a kernel on the GPU. - ## also checks if launchKernel() returns an error. - ## Important: this only checks if the kernel launch was successful, not the kernel itself. - # - # This code is kinda gross, the launch kernel functions have a lot of different signatures. - var kernelArgs: seq[pointer] - for key, arg in args.fieldPairs: - let a1 = arg - kernelArgs.add(cast[pointer](addr a1)) - when HippoRuntime == "HIP": +proc launchKernel*( + kernel: proc, + gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block + blockDim: Dim3 = newDim3(1,1,1), # default to 1 thread per block + sharedMemBytes: uint32 = 0, + stream: HippoStream = nil, + args: tuple +): HippoError = + # launchKernel is designed to be similar to `kernel`<<>>(args) + + # this function is horrible but it works + # needs to be refactored to handle all the different runtimes and arguments better + + # having some issues between hip and hip-cpu, so defining different versions of launchKernel + when HippoRuntime == "HIP" and HipPlatform == "amd": + # This branch works for all args + echo "executing HIP" + var kernelArgs: seq[pointer] + for key, arg in args.fieldPairs: + kernelArgs.add(cast[pointer](addr arg)) result = hipLaunchKernel( cast[pointer](kernel), gridDim, blockDim, cast[ptr pointer](addr kernelArgs[0]), - sharedMemBytes, - stream ) + elif HippoRuntime == "HIP" and HipPlatform == "nvidia": + # TODO fix args on this branch + hipLaunchKernelGGL( + kernel, + gridDim, + blockDim, + 0, # TODO + nil, # TODO + # TODO handle args properly + cast[ptr[cint]](args[0]), + cast[ptr[cint]](args[1]), + cast[ptr[cint]](args[2]) + ) + result = hipGetLastError() elif HippoRuntime == "HIP_CPU": - # I couldn't find a good way to call hipLaunchKernelGGL() with args as a tuple from nim - # so I'm using hipModuleLaunchKernel() instead, It's a much simpler interface. - result = hipModuleLaunchKernel( - cast[pointer](kernel), - gridDim.x, - gridDim.y, - gridDim.z, - blockDim.x, - blockDim.y, - blockDim.z, - sharedMemBytes, - stream, - cast[ptr pointer](addr kernelArgs[0]), - nil + # TODO fix args on this branch + echo "executing kernel on CPU" + hipLaunchKernelGGL( + kernel, + gridDim, + blockDim, + 0, # TODO + nil, # TODO + # TODO handle args properly + args[0], + args[1], + args[2] ) + result = hipGetLastError() elif HippoRuntime == "CUDA": + # This branch works for all args + echo "executing CUDA" + var kernelArgs: seq[pointer] + for key, arg in args.fieldPairs: + kernelArgs.add(cast[pointer](addr arg)) result = cudaLaunchKernel( kernel, gridDim, blockDim, - cast[ptr pointer](addr kernelArgs[0]), - sharedMemBytes, - stream + cast[ptr pointer](addr kernelArgs[0]) + #sharedMemBytes, + #stream ) else: raise newException(Exception, &"Unknown runtime: {HippoRuntime}") - handleError(result) - +template hippoLaunchKernel*( + kernel: proc, ## The GPU kernel procedure to launch + gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block + blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block + sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate + stream: HippoStream = nil, ## Which device stream to run under (defaults to null) + args: tuple, ## Arguments to pass to the GPU kernel +) = + ## Launch a kernel on the GPU and check for errors + handleError(launchKernel(kernel, gridDim, blockDim, sharedMemBytes, stream, args)) # ------------------- diff --git a/tests/hip/call_params.nim b/tests/hip/call_params.nim index edb4f72..7013666 100644 --- a/tests/hip/call_params.nim +++ b/tests/hip/call_params.nim @@ -8,10 +8,10 @@ proc main() = var c: int32 var dev_c: ptr[int32] handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint)) - hippoLaunchKernel( + handleError(launchKernel( addKernel, args = (2,7,dev_c) - ) + )) handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost)) echo "2 + 7 = ", c handleError(hipFree(dev_c)) diff --git a/tests/hip/dot.nim b/tests/hip/dot.nim index 6c97bb6..3b96f4a 100644 --- a/tests/hip/dot.nim +++ b/tests/hip/dot.nim @@ -62,12 +62,12 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(float64)*N, hipMemcpyHostToDevice)) # launch kernel - hippoLaunchKernel( + handleError(launchKernel( dot, gridDim = newDim3(BlocksPerGrid.uint32), blockDim = newDim3(ThreadsPerBlock.uint32), args = (dev_a, dev_b, dev_partial_c) - ) + )) # copy memory back from GPU to CPU handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(float64), hipMemcpyDeviceToHost)) diff --git a/tests/hip/vector_sum.nim b/tests/hip/vector_sum.nim index 8fa62b9..9e7f983 100644 --- a/tests/hip/vector_sum.nim +++ b/tests/hip/vector_sum.nim @@ -29,11 +29,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - hippoLaunchKernel( + handleError(launchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - ) + )) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) From 2b0266a1103ab48354dbf519e1096a28b8ed6ae4 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 20:13:38 -0400 Subject: [PATCH 02/12] Reapply "varargs fixes" This reverts commit 77ab1372c6a3b8a89c3cedcde0bf441bd0ef51ba. --- README.md | 2 +- examples/vector_sum_cpu.nim | 4 +- examples/vector_sum_cuda.nim | 4 +- examples/vector_sum_hip_amd.nim | 4 +- examples/vector_sum_hip_nvidia.nim | 4 +- src/cuda.nim | 3 - src/hip.nim | 20 +++++- src/hippo.nim | 105 +++++++++++------------------ tests/hip/call_params.nim | 4 +- tests/hip/dot.nim | 4 +- tests/hip/vector_sum.nim | 4 +- 11 files changed, 72 insertions(+), 86 deletions(-) diff --git a/README.md b/README.md index f791e0a..b58c32a 100644 --- a/README.md +++ b/README.md @@ -20,7 +20,7 @@ proc addKernel*(a, b: cint; c: ptr[cint]) {.hippoGlobal.} = var c: int32 var dev_c: ptr[int32] handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint)) -handleError(launchKernel(addKernel,args = (2,7,dev_c))) +hippoLaunchKernel(addKernel,args = (2,7,dev_c)) handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost)) echo "2 + 7 = ", c handleError(hipFree(dev_c)) diff --git a/examples/vector_sum_cpu.nim b/examples/vector_sum_cpu.nim index 2b9cb24..4d940c0 100644 --- a/examples/vector_sum_cpu.nim +++ b/examples/vector_sum_cpu.nim @@ -34,11 +34,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - handleError(launchKernel( + hippoLaunchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - )) + ) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) diff --git a/examples/vector_sum_cuda.nim b/examples/vector_sum_cuda.nim index 5f0fa21..01f2c45 100644 --- a/examples/vector_sum_cuda.nim +++ b/examples/vector_sum_cuda.nim @@ -35,11 +35,11 @@ proc main() = handleError(cudaMemcpy(dev_b, addr b[0], sizeof(int32)*N, cudaMemcpyHostToDevice)) # launch kernel - handleError(launchKernel( + hippoLaunchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - )) + ) # copy result back to host handleError(cudaMemcpy(addr c[0], dev_c, sizeof(int32)*N, cudaMemcpyDeviceToHost)) diff --git a/examples/vector_sum_hip_amd.nim b/examples/vector_sum_hip_amd.nim index 3aba581..8bdb762 100644 --- a/examples/vector_sum_hip_amd.nim +++ b/examples/vector_sum_hip_amd.nim @@ -34,11 +34,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - handleError(launchKernel( + hippoLaunchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - )) + ) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) diff --git a/examples/vector_sum_hip_nvidia.nim b/examples/vector_sum_hip_nvidia.nim index 00102f0..165119b 100644 --- a/examples/vector_sum_hip_nvidia.nim +++ b/examples/vector_sum_hip_nvidia.nim @@ -37,11 +37,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - handleError(launchKernel( + hippoLaunchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - )) + ) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) diff --git a/src/cuda.nim b/src/cuda.nim index 296f0ea..f4601ed 100644 --- a/src/cuda.nim +++ b/src/cuda.nim @@ -55,9 +55,6 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer, sharedMemBytes: uint32_t, stream: cudaStream_t): cudaError_t {. importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".} -# proc cudaLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3; -# args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {. -# importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".} proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".} proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".} proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".} diff --git a/src/hip.nim b/src/hip.nim index 942a412..ac15624 100644 --- a/src/hip.nim +++ b/src/hip.nim @@ -52,9 +52,9 @@ proc hipFree*(`ptr`: pointer): hipError_t {.header: "hip/hip_runtime.h",importcp proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; args: ptr pointer): hipError_t {. importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".} -# proc hipLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3; -# args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {. -# importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".} +proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3; + args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {. + importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".} proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".} proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".} proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".} @@ -68,6 +68,20 @@ proc hipLaunchKernelGGL*( ) {. importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.} +proc hipModuleLaunchKernel*( + function_address: pointer; + grid_dim_x: uint32_t; + grid_dim_y: uint32_t; + grid_dim_z: uint32_t; + block_dim_x: uint32_t; + block_dim_y: uint32_t; + block_dim_z: uint32_t; + sharedMemBytes: uint32_t; + stream: hipStream_t; + kernel_params: ptr pointer; + extra: ptr pointer; + ): hipError_t {.importcpp: "hipModuleLaunchKernel(@)", header: "hip/hip_runtime.h".} + type ConstCString* {.importc: "const char*".} = object converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.} diff --git a/src/hippo.nim b/src/hippo.nim index 280d0e7..4339d85 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -134,88 +134,63 @@ proc `=destroy`*(mem: var GpuMemory) = # ------------------- # Kernel Execution -proc launchKernel*( - kernel: proc, - gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block - blockDim: Dim3 = newDim3(1,1,1), # default to 1 thread per block - sharedMemBytes: uint32 = 0, - stream: HippoStream = nil, - args: tuple -): HippoError = - # launchKernel is designed to be similar to `kernel`<<>>(args) - - # this function is horrible but it works - # needs to be refactored to handle all the different runtimes and arguments better - - # having some issues between hip and hip-cpu, so defining different versions of launchKernel - when HippoRuntime == "HIP" and HipPlatform == "amd": - # This branch works for all args - echo "executing HIP" - var kernelArgs: seq[pointer] - for key, arg in args.fieldPairs: - kernelArgs.add(cast[pointer](addr arg)) +template hippoLaunchKernel*( + kernel: proc, ## The GPU kernel procedure to launch + gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block + blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block + sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate + stream: HippoStream = nil, ## Which device stream to run under (defaults to null) + args: tuple, ## Arguments to pass to the GPU kernel +) = + var result: HippoError + ## Launch a kernel on the GPU. + ## also checks if launchKernel() returns an error. + ## Important: this only checks if the kernel launch was successful, not the kernel itself. + # + # This code is kinda gross, the launch kernel functions have a lot of different signatures. + var kernelArgs: seq[pointer] + for key, arg in args.fieldPairs: + let a1 = arg + kernelArgs.add(cast[pointer](addr a1)) + when HippoRuntime == "HIP": result = hipLaunchKernel( cast[pointer](kernel), gridDim, blockDim, cast[ptr pointer](addr kernelArgs[0]), + sharedMemBytes, + stream ) - elif HippoRuntime == "HIP" and HipPlatform == "nvidia": - # TODO fix args on this branch - hipLaunchKernelGGL( - kernel, - gridDim, - blockDim, - 0, # TODO - nil, # TODO - # TODO handle args properly - cast[ptr[cint]](args[0]), - cast[ptr[cint]](args[1]), - cast[ptr[cint]](args[2]) - ) - result = hipGetLastError() elif HippoRuntime == "HIP_CPU": - # TODO fix args on this branch - echo "executing kernel on CPU" - hipLaunchKernelGGL( - kernel, - gridDim, - blockDim, - 0, # TODO - nil, # TODO - # TODO handle args properly - args[0], - args[1], - args[2] + # I couldn't find a good way to call hipLaunchKernelGGL() with args as a tuple from nim + # so I'm using hipModuleLaunchKernel() instead, It's a much simpler interface. + result = hipModuleLaunchKernel( + cast[pointer](kernel), + gridDim.x, + gridDim.y, + gridDim.z, + blockDim.x, + blockDim.y, + blockDim.z, + sharedMemBytes, + stream, + cast[ptr pointer](addr kernelArgs[0]), + nil ) - result = hipGetLastError() elif HippoRuntime == "CUDA": - # This branch works for all args - echo "executing CUDA" - var kernelArgs: seq[pointer] - for key, arg in args.fieldPairs: - kernelArgs.add(cast[pointer](addr arg)) result = cudaLaunchKernel( kernel, gridDim, blockDim, - cast[ptr pointer](addr kernelArgs[0]) - #sharedMemBytes, - #stream + cast[ptr pointer](addr kernelArgs[0]), + sharedMemBytes, + stream ) else: raise newException(Exception, &"Unknown runtime: {HippoRuntime}") -template hippoLaunchKernel*( - kernel: proc, ## The GPU kernel procedure to launch - gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block - blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block - sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate - stream: HippoStream = nil, ## Which device stream to run under (defaults to null) - args: tuple, ## Arguments to pass to the GPU kernel -) = - ## Launch a kernel on the GPU and check for errors - handleError(launchKernel(kernel, gridDim, blockDim, sharedMemBytes, stream, args)) + handleError(result) + # ------------------- diff --git a/tests/hip/call_params.nim b/tests/hip/call_params.nim index 7013666..edb4f72 100644 --- a/tests/hip/call_params.nim +++ b/tests/hip/call_params.nim @@ -8,10 +8,10 @@ proc main() = var c: int32 var dev_c: ptr[int32] handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint)) - handleError(launchKernel( + hippoLaunchKernel( addKernel, args = (2,7,dev_c) - )) + ) handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost)) echo "2 + 7 = ", c handleError(hipFree(dev_c)) diff --git a/tests/hip/dot.nim b/tests/hip/dot.nim index 3b96f4a..6c97bb6 100644 --- a/tests/hip/dot.nim +++ b/tests/hip/dot.nim @@ -62,12 +62,12 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(float64)*N, hipMemcpyHostToDevice)) # launch kernel - handleError(launchKernel( + hippoLaunchKernel( dot, gridDim = newDim3(BlocksPerGrid.uint32), blockDim = newDim3(ThreadsPerBlock.uint32), args = (dev_a, dev_b, dev_partial_c) - )) + ) # copy memory back from GPU to CPU handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(float64), hipMemcpyDeviceToHost)) diff --git a/tests/hip/vector_sum.nim b/tests/hip/vector_sum.nim index 9e7f983..8fa62b9 100644 --- a/tests/hip/vector_sum.nim +++ b/tests/hip/vector_sum.nim @@ -29,11 +29,11 @@ proc main() = handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice)) # launch kernel - handleError(launchKernel( + hippoLaunchKernel( addkernel, gridDim = newDim3(N.uint32), args = (dev_a, dev_b, dev_c) - )) + ) # copy result back to host handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost)) From 052d1874f8a6766500ccf03def1e85f13e19dfcf Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 20:38:28 -0400 Subject: [PATCH 03/12] fix kernel args --- .gitignore | 1 + Dockerfile | 16 ++++++++++++++++ docker-compose.yaml | 9 +++++++++ src/hippo.nim | 4 ++-- 4 files changed, 28 insertions(+), 2 deletions(-) create mode 100644 Dockerfile create mode 100644 docker-compose.yaml diff --git a/.gitignore b/.gitignore index f5e0412..21240fa 100644 --- a/.gitignore +++ b/.gitignore @@ -2,6 +2,7 @@ * !*/ !*.* +!Dockerfile # normal ignores: *.exe diff --git a/Dockerfile b/Dockerfile new file mode 100644 index 0000000..c3ae070 --- /dev/null +++ b/Dockerfile @@ -0,0 +1,16 @@ +FROM ubuntu:22.04 + +RUN apt update && apt install -y git gcc g++ make libtbb2-dev && apt clean + +RUN git clone --branch devel https://github.com/nim-lang/Nim.git --depth 1 /opt/Nim + +WORKDIR /opt/Nim +RUN sh ./build_all.sh + +RUN ./bin/nim c koch +RUN ./koch boot -d:release +RUN ./koch tools + +RUN mkdir -p /root/.nimble/bin + +ENV PATH="/opt/Nim/bin/:/root/.nimble/bin:${PATH}" diff --git a/docker-compose.yaml b/docker-compose.yaml new file mode 100644 index 0000000..408f609 --- /dev/null +++ b/docker-compose.yaml @@ -0,0 +1,9 @@ +version: '3.8' + +services: + hippo-build: + build: + context: . + dockerfile: ./Dockerfile + volumes: + - .:/p/hippo \ No newline at end of file diff --git a/src/hippo.nim b/src/hippo.nim index 4339d85..70b9a1e 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -174,8 +174,8 @@ template hippoLaunchKernel*( blockDim.z, sharedMemBytes, stream, - cast[ptr pointer](addr kernelArgs[0]), - nil + nil, + cast[ptr pointer](addr kernelArgs[0]) ) elif HippoRuntime == "CUDA": result = cudaLaunchKernel( From 2c884a6c680a5e4bde5fa96012ff23df0c5c76f8 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 21:27:00 -0400 Subject: [PATCH 04/12] back where I started --- Dockerfile | 2 +- src/hip.nim | 15 --------------- src/hippo.nim | 41 +++++++++++++++++++++++++++-------------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/Dockerfile b/Dockerfile index c3ae070..8da4353 100644 --- a/Dockerfile +++ b/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:22.04 -RUN apt update && apt install -y git gcc g++ make libtbb2-dev && apt clean +RUN apt update && apt install -y git gdb gcc g++ make libtbb2-dev && apt clean RUN git clone --branch devel https://github.com/nim-lang/Nim.git --depth 1 /opt/Nim diff --git a/src/hip.nim b/src/hip.nim index ac15624..503e5cc 100644 --- a/src/hip.nim +++ b/src/hip.nim @@ -68,21 +68,6 @@ proc hipLaunchKernelGGL*( ) {. importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.} -proc hipModuleLaunchKernel*( - function_address: pointer; - grid_dim_x: uint32_t; - grid_dim_y: uint32_t; - grid_dim_z: uint32_t; - block_dim_x: uint32_t; - block_dim_y: uint32_t; - block_dim_z: uint32_t; - sharedMemBytes: uint32_t; - stream: hipStream_t; - kernel_params: ptr pointer; - extra: ptr pointer; - ): hipError_t {.importcpp: "hipModuleLaunchKernel(@)", header: "hip/hip_runtime.h".} - - type ConstCString* {.importc: "const char*".} = object converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.} converter toConstCString*(self: cstring): ConstCString {.importc: "(const char*)", noconv, nodecl.} diff --git a/src/hippo.nim b/src/hippo.nim index 70b9a1e..d738518 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -140,7 +140,7 @@ template hippoLaunchKernel*( blockDim: Dim3 = newDim3(1,1,1), ## default to 1 thread per block sharedMemBytes: uint32 = 0, ## dynamic shared memory amount to allocate stream: HippoStream = nil, ## Which device stream to run under (defaults to null) - args: tuple, ## Arguments to pass to the GPU kernel + args: tuple, ## Arguments to pass to the GPU kernel ) = var result: HippoError ## Launch a kernel on the GPU. @@ -152,7 +152,7 @@ template hippoLaunchKernel*( for key, arg in args.fieldPairs: let a1 = arg kernelArgs.add(cast[pointer](addr a1)) - when HippoRuntime == "HIP": + when HippoRuntime == "HIP" and HipPlatform == "amd": result = hipLaunchKernel( cast[pointer](kernel), gridDim, @@ -161,22 +161,35 @@ template hippoLaunchKernel*( sharedMemBytes, stream ) + elif HippoRuntime == "HIP" and HipPlatform == "nvidia": + hipLaunchKernelGGL( + kernel, + gridDim, + blockDim, + sharedMemBytes, + stream, + # TODO handle args properly + cast[ptr[cint]](args[0]), + cast[ptr[cint]](args[1]), + cast[ptr[cint]](args[2]) + ) + result = hipGetLastError() elif HippoRuntime == "HIP_CPU": - # I couldn't find a good way to call hipLaunchKernelGGL() with args as a tuple from nim - # so I'm using hipModuleLaunchKernel() instead, It's a much simpler interface. - result = hipModuleLaunchKernel( - cast[pointer](kernel), - gridDim.x, - gridDim.y, - gridDim.z, - blockDim.x, - blockDim.y, - blockDim.z, + + # wonder if it's possible to build a macro to turn kernelArgs to args? + # the cpp interface is tricky + hipLaunchKernelGGL( + kernel, + gridDim, + blockDim, sharedMemBytes, stream, - nil, - cast[ptr pointer](addr kernelArgs[0]) + # TODO handle args properly + args[0], + args[1], + args[2] ) + result = hipGetLastError() elif HippoRuntime == "CUDA": result = cudaLaunchKernel( kernel, From f8fce03174a055671f12fef53c918d4d16e0d089 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 21:45:36 -0400 Subject: [PATCH 05/12] fix varargs --- src/hippo.nim | 39 +++++++++++++++++++++++++++------------ 1 file changed, 27 insertions(+), 12 deletions(-) diff --git a/src/hippo.nim b/src/hippo.nim index d738518..2b4b0ff 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -134,6 +134,29 @@ proc `=destroy`*(mem: var GpuMemory) = # ------------------- # Kernel Execution +macro hipLaunchKernelGGLWithTuple( + kernel: proc, + gridDim: Dim3 = newDim3(1,1,1), + blockDim: Dim3 = newDim3(1,1,1), + sharedMemBytes: uint32 = 0, + stream: HippoStream = nil, + args: tuple + ): untyped = + + var callNode = newCall(bindSym"hipLaunchKernelGGL") + + # add the fixed vars + callNode.add kernel + callNode.add gridDim + callNode.add blockDim + callNode.add sharedMemBytes + callNode.add stream + + # add every value of the tuple + for child in args: + callNode.add child + result = callNode + template hippoLaunchKernel*( kernel: proc, ## The GPU kernel procedure to launch gridDim: Dim3 = newDim3(1,1,1), ## default to a grid of 1 block @@ -162,32 +185,24 @@ template hippoLaunchKernel*( stream ) elif HippoRuntime == "HIP" and HipPlatform == "nvidia": - hipLaunchKernelGGL( + hipLaunchKernelGGLWithTuple( kernel, gridDim, blockDim, sharedMemBytes, stream, - # TODO handle args properly - cast[ptr[cint]](args[0]), - cast[ptr[cint]](args[1]), - cast[ptr[cint]](args[2]) + args ) result = hipGetLastError() elif HippoRuntime == "HIP_CPU": - # wonder if it's possible to build a macro to turn kernelArgs to args? - # the cpp interface is tricky - hipLaunchKernelGGL( + hipLaunchKernelGGLWithTuple( kernel, gridDim, blockDim, sharedMemBytes, stream, - # TODO handle args properly - args[0], - args[1], - args[2] + args ) result = hipGetLastError() elif HippoRuntime == "CUDA": From c771ed37fd35829df829422d279993b924f91b0f Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 21:48:55 -0400 Subject: [PATCH 06/12] bump version --- hippo.nimble | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/hippo.nimble b/hippo.nimble index 7ada4ff..348d8c7 100644 --- a/hippo.nimble +++ b/hippo.nimble @@ -1,4 +1,4 @@ -version = "0.5.2" +version = "0.5.5" author = "Andrew Brower" description = "HIP library for Nim" license = "MIT" From 83634f9e6cd30db2b5782276c69ed1332e2f5f8a Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 22:03:28 -0400 Subject: [PATCH 07/12] fixes --- src/hippo.nim | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/src/hippo.nim b/src/hippo.nim index 2b4b0ff..00b8ad3 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -135,7 +135,7 @@ proc `=destroy`*(mem: var GpuMemory) = # Kernel Execution macro hipLaunchKernelGGLWithTuple( - kernel: proc, + kernel: proc, gridDim: Dim3 = newDim3(1,1,1), blockDim: Dim3 = newDim3(1,1,1), sharedMemBytes: uint32 = 0, @@ -184,18 +184,7 @@ template hippoLaunchKernel*( sharedMemBytes, stream ) - elif HippoRuntime == "HIP" and HipPlatform == "nvidia": - hipLaunchKernelGGLWithTuple( - kernel, - gridDim, - blockDim, - sharedMemBytes, - stream, - args - ) - result = hipGetLastError() - elif HippoRuntime == "HIP_CPU": - + elif (HippoRuntime == "HIP" and HipPlatform == "nvidia") or HippoRuntime == "HIP_CPU": hipLaunchKernelGGLWithTuple( kernel, gridDim, From 6a2ba81e4ea16d15f8cc74b5e38addf102439111 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 22:04:45 -0400 Subject: [PATCH 08/12] ensure cpu runtime for vector_sum_cpu --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 743ed86..0b95d2d 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,7 +35,7 @@ jobs: - name: Build Examples run: | - nim cpp examples/vector_sum_cpu.nim + HIP_PLATFORM=HIP_CPU nim cpp examples/vector_sum_cpu.nim nim cpp examples/vector_sum_cuda.nim nim cpp examples/vector_sum_hippo.nim nim cpp examples/vector_sum_hip_amd.nim From fc8561e0a7148029fb075aa5ede7d0afc1966b00 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 22:20:25 -0400 Subject: [PATCH 09/12] undo env var --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 0b95d2d..743ed86 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,7 +35,7 @@ jobs: - name: Build Examples run: | - HIP_PLATFORM=HIP_CPU nim cpp examples/vector_sum_cpu.nim + nim cpp examples/vector_sum_cpu.nim nim cpp examples/vector_sum_cuda.nim nim cpp examples/vector_sum_hippo.nim nim cpp examples/vector_sum_hip_amd.nim From 662419dc6d099867b92b2579054717ffeb9fe4d5 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 22:22:39 -0400 Subject: [PATCH 10/12] skip cpu example for now --- .github/workflows/build.yml | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 743ed86..6c39094 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,12 +35,12 @@ jobs: - name: Build Examples run: | - nim cpp examples/vector_sum_cpu.nim + #nim cpp examples/vector_sum_cpu.nim nim cpp examples/vector_sum_cuda.nim nim cpp examples/vector_sum_hippo.nim nim cpp examples/vector_sum_hip_amd.nim #nim cpp examples/vector_sum_hip_nvidia.nim - - name: Execute CPU Example - run: | - ./examples/vector_sum_cpu + # - name: Execute CPU Example + # run: | + # ./examples/vector_sum_cpu From 317862222ce17240ede8555349f7a4fd4cc063e8 Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 22:33:10 -0400 Subject: [PATCH 11/12] oops --- .github/workflows/build.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml index 6c39094..a2a89c2 100644 --- a/.github/workflows/build.yml +++ b/.github/workflows/build.yml @@ -35,7 +35,7 @@ jobs: - name: Build Examples run: | - #nim cpp examples/vector_sum_cpu.nim + nim cpp examples/vector_sum_cpu.nim nim cpp examples/vector_sum_cuda.nim nim cpp examples/vector_sum_hippo.nim nim cpp examples/vector_sum_hip_amd.nim From bc52fbb2ee2269d3952e1f937306cb45a720ebab Mon Sep 17 00:00:00 2001 From: monofuel Date: Sun, 1 Sep 2024 22:34:53 -0400 Subject: [PATCH 12/12] conditionally define macro --- src/hippo.nim | 45 +++++++++++++++++++++++---------------------- 1 file changed, 23 insertions(+), 22 deletions(-) diff --git a/src/hippo.nim b/src/hippo.nim index 00b8ad3..7471e96 100644 --- a/src/hippo.nim +++ b/src/hippo.nim @@ -134,28 +134,29 @@ proc `=destroy`*(mem: var GpuMemory) = # ------------------- # Kernel Execution -macro hipLaunchKernelGGLWithTuple( - kernel: proc, - gridDim: Dim3 = newDim3(1,1,1), - blockDim: Dim3 = newDim3(1,1,1), - sharedMemBytes: uint32 = 0, - stream: HippoStream = nil, - args: tuple - ): untyped = - - var callNode = newCall(bindSym"hipLaunchKernelGGL") - - # add the fixed vars - callNode.add kernel - callNode.add gridDim - callNode.add blockDim - callNode.add sharedMemBytes - callNode.add stream - - # add every value of the tuple - for child in args: - callNode.add child - result = callNode +when HippoRuntime == "HIP" or HippoRuntime == "HIP_CPU": + macro hipLaunchKernelGGLWithTuple( + kernel: proc, + gridDim: Dim3 = newDim3(1,1,1), + blockDim: Dim3 = newDim3(1,1,1), + sharedMemBytes: uint32 = 0, + stream: HippoStream = nil, + args: tuple + ): untyped = + + var callNode = newCall(bindSym"hipLaunchKernelGGL") + + # add the fixed vars + callNode.add kernel + callNode.add gridDim + callNode.add blockDim + callNode.add sharedMemBytes + callNode.add stream + + # add every value of the tuple + for child in args: + callNode.add child + result = callNode template hippoLaunchKernel*( kernel: proc, ## The GPU kernel procedure to launch