From abd56d1833a2382fc896d0c94a718b7df38d9d0b Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 20:12:57 -0400
Subject: [PATCH 01/12] Revert "varargs fixes"

This reverts commit 85adef7fe2fbe2b415a7a04398dabbfe1579f01e.
---
 README.md                          |   2 +-
 examples/vector_sum_cpu.nim        |   4 +-
 examples/vector_sum_cuda.nim       |   4 +-
 examples/vector_sum_hip_amd.nim    |   4 +-
 examples/vector_sum_hip_nvidia.nim |   4 +-
 src/cuda.nim                       |   3 +
 src/hip.nim                        |  20 +-----
 src/hippo.nim                      | 105 ++++++++++++++++++-----------
 tests/hip/call_params.nim          |   4 +-
 tests/hip/dot.nim                  |   4 +-
 tests/hip/vector_sum.nim           |   4 +-
 11 files changed, 86 insertions(+), 72 deletions(-)

diff --git a/README.md b/README.md
index b58c32a..f791e0a 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ proc addKernel*(a, b: cint; c: ptr[cint]) {.hippoGlobal.} =
 var c: int32
 var dev_c: ptr[int32]
 handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
-hippoLaunchKernel(addKernel,args = (2,7,dev_c))
+handleError(launchKernel(addKernel,args = (2,7,dev_c)))
 handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
 echo "2 + 7 = ", c
 handleError(hipFree(dev_c))
diff --git a/examples/vector_sum_cpu.nim b/examples/vector_sum_cpu.nim
index 4d940c0..2b9cb24 100644
--- a/examples/vector_sum_cpu.nim
+++ b/examples/vector_sum_cpu.nim
@@ -34,11 +34,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  hippoLaunchKernel(
+  handleError(launchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  )
+  ))
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
diff --git a/examples/vector_sum_cuda.nim b/examples/vector_sum_cuda.nim
index 01f2c45..5f0fa21 100644
--- a/examples/vector_sum_cuda.nim
+++ b/examples/vector_sum_cuda.nim
@@ -35,11 +35,11 @@ proc main() =
   handleError(cudaMemcpy(dev_b, addr b[0], sizeof(int32)*N, cudaMemcpyHostToDevice))
 
   # launch kernel
-  hippoLaunchKernel(
+  handleError(launchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  )
+  ))
 
   # copy result back to host
   handleError(cudaMemcpy(addr c[0], dev_c, sizeof(int32)*N, cudaMemcpyDeviceToHost))
diff --git a/examples/vector_sum_hip_amd.nim b/examples/vector_sum_hip_amd.nim
index 8bdb762..3aba581 100644
--- a/examples/vector_sum_hip_amd.nim
+++ b/examples/vector_sum_hip_amd.nim
@@ -34,11 +34,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  hippoLaunchKernel(
+  handleError(launchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  )
+  ))
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
diff --git a/examples/vector_sum_hip_nvidia.nim b/examples/vector_sum_hip_nvidia.nim
index 165119b..00102f0 100644
--- a/examples/vector_sum_hip_nvidia.nim
+++ b/examples/vector_sum_hip_nvidia.nim
@@ -37,11 +37,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  hippoLaunchKernel(
+  handleError(launchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  )
+  ))
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
diff --git a/src/cuda.nim b/src/cuda.nim
index f4601ed..296f0ea 100644
--- a/src/cuda.nim
+++ b/src/cuda.nim
@@ -55,6 +55,9 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di
 proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
                      args: ptr pointer, sharedMemBytes: uint32_t, stream: cudaStream_t): cudaError_t {.
     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
+# proc cudaLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
+#                      args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {.
+#     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
 proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".}
 proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
 proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
diff --git a/src/hip.nim b/src/hip.nim
index ac15624..942a412 100644
--- a/src/hip.nim
+++ b/src/hip.nim
@@ -52,9 +52,9 @@ proc hipFree*(`ptr`: pointer): hipError_t {.header: "hip/hip_runtime.h",importcp
 proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
                      args: ptr pointer): hipError_t {.
     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
-proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
-                     args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
-    importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
+# proc hipLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
+#                      args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
+#     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
 proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".}
 proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
 proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
@@ -68,20 +68,6 @@ proc hipLaunchKernelGGL*(
   ) {.
     importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.}
 
-proc hipModuleLaunchKernel*(
-  function_address: pointer;
-  grid_dim_x: uint32_t;
-  grid_dim_y: uint32_t;
-  grid_dim_z: uint32_t;
-  block_dim_x: uint32_t;
-  block_dim_y: uint32_t;
-  block_dim_z: uint32_t;
-  sharedMemBytes: uint32_t;
-  stream: hipStream_t;
-  kernel_params: ptr pointer;
-  extra: ptr pointer;
-  ): hipError_t {.importcpp: "hipModuleLaunchKernel(@)", header: "hip/hip_runtime.h".}
-
 
 type ConstCString* {.importc: "const char*".} = object
 converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.}
diff --git a/src/hippo.nim b/src/hippo.nim
index 4339d85..280d0e7 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -134,63 +134,88 @@ proc `=destroy`*(mem: var GpuMemory) =
 # -------------------
 # Kernel Execution
 
-template hippoLaunchKernel*(
-  kernel: proc,                     ## The GPU kernel procedure to launch
-  gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
-  blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
-  sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
-  stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
-  args: tuple,                      ## Arguments to pass to the GPU kernel
-) =
-  var result: HippoError
-  ## Launch a kernel on the GPU.
-  ## also checks if launchKernel() returns an error.
-  ## Important: this only checks if the kernel launch was successful, not the kernel itself.
-  # 
-  # This code is kinda gross, the launch kernel functions have a lot of different signatures.
-  var kernelArgs: seq[pointer]
-  for key, arg in args.fieldPairs:
-    let a1 = arg
-    kernelArgs.add(cast[pointer](addr a1))
-  when HippoRuntime == "HIP":
+proc launchKernel*(
+  kernel: proc,
+  gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block
+  blockDim: Dim3 = newDim3(1,1,1),  # default to 1 thread per block
+  sharedMemBytes: uint32 = 0,
+  stream: HippoStream = nil,
+  args: tuple
+): HippoError =
+  # launchKernel is designed to be similar to `kernel`<<<blockDim, gridDim>>>(args)
+
+  # this function is horrible but it works
+  # needs to be refactored to handle all the different runtimes and arguments better
+
+  # having some issues between hip and hip-cpu, so defining different versions of launchKernel
+  when HippoRuntime == "HIP" and HipPlatform == "amd":
+    # This branch works for all args
+    echo "executing HIP"
+    var kernelArgs: seq[pointer]
+    for key, arg in args.fieldPairs:
+      kernelArgs.add(cast[pointer](addr arg))
     result = hipLaunchKernel(
       cast[pointer](kernel),
       gridDim,
       blockDim,
       cast[ptr pointer](addr kernelArgs[0]),
-      sharedMemBytes,
-      stream
     )
+  elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
+    # TODO fix args on this branch
+    hipLaunchKernelGGL(
+      kernel,
+      gridDim,
+      blockDim,
+      0, # TODO
+      nil, # TODO
+      # TODO handle args properly
+      cast[ptr[cint]](args[0]),
+      cast[ptr[cint]](args[1]),
+      cast[ptr[cint]](args[2])
+      )
+    result = hipGetLastError()
   elif HippoRuntime == "HIP_CPU":
-    # I couldn't find a good way to call hipLaunchKernelGGL() with args as a tuple from nim
-    # so I'm using hipModuleLaunchKernel() instead, It's a much simpler interface.
-    result = hipModuleLaunchKernel(
-      cast[pointer](kernel),
-      gridDim.x,
-      gridDim.y,
-      gridDim.z,
-      blockDim.x,
-      blockDim.y,
-      blockDim.z,
-      sharedMemBytes,
-      stream,
-      cast[ptr pointer](addr kernelArgs[0]),
-      nil
+    # TODO fix args on this branch
+    echo "executing kernel on CPU"
+    hipLaunchKernelGGL(
+      kernel,
+      gridDim,
+      blockDim,
+      0, # TODO
+      nil, # TODO
+      # TODO handle args properly
+      args[0],
+      args[1],
+      args[2]
     )
+    result = hipGetLastError()
   elif HippoRuntime == "CUDA":
+    # This branch works for all args
+    echo "executing CUDA"
+    var kernelArgs: seq[pointer]
+    for key, arg in args.fieldPairs:
+      kernelArgs.add(cast[pointer](addr arg))
     result = cudaLaunchKernel(
       kernel,
       gridDim,
       blockDim,
-      cast[ptr pointer](addr kernelArgs[0]),
-      sharedMemBytes,
-      stream
+      cast[ptr pointer](addr kernelArgs[0])
+      #sharedMemBytes,
+      #stream
     )
   else:
     raise newException(Exception, &"Unknown runtime: {HippoRuntime}")
 
-  handleError(result)
-  
+template hippoLaunchKernel*(
+  kernel: proc,                     ## The GPU kernel procedure to launch
+  gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
+  blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
+  sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
+  stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
+  args: tuple,                      ## Arguments to pass to the GPU kernel
+) =
+  ## Launch a kernel on the GPU and check for errors
+  handleError(launchKernel(kernel, gridDim, blockDim, sharedMemBytes, stream, args))
 
 
 # -------------------
diff --git a/tests/hip/call_params.nim b/tests/hip/call_params.nim
index edb4f72..7013666 100644
--- a/tests/hip/call_params.nim
+++ b/tests/hip/call_params.nim
@@ -8,10 +8,10 @@ proc main() =
   var c: int32
   var dev_c: ptr[int32]
   handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
-  hippoLaunchKernel(
+  handleError(launchKernel(
     addKernel,
     args = (2,7,dev_c)
-  )
+  ))
   handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
   echo "2 + 7 = ", c
   handleError(hipFree(dev_c))
diff --git a/tests/hip/dot.nim b/tests/hip/dot.nim
index 6c97bb6..3b96f4a 100644
--- a/tests/hip/dot.nim
+++ b/tests/hip/dot.nim
@@ -62,12 +62,12 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(float64)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  hippoLaunchKernel(
+  handleError(launchKernel(
     dot,
     gridDim = newDim3(BlocksPerGrid.uint32),
     blockDim = newDim3(ThreadsPerBlock.uint32),
     args = (dev_a, dev_b, dev_partial_c)
-  )
+  ))
 
   # copy memory back from GPU to CPU
   handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(float64), hipMemcpyDeviceToHost))
diff --git a/tests/hip/vector_sum.nim b/tests/hip/vector_sum.nim
index 8fa62b9..9e7f983 100644
--- a/tests/hip/vector_sum.nim
+++ b/tests/hip/vector_sum.nim
@@ -29,11 +29,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  hippoLaunchKernel(
+  handleError(launchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  )
+  ))
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))

From 2b0266a1103ab48354dbf519e1096a28b8ed6ae4 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 20:13:38 -0400
Subject: [PATCH 02/12] Reapply "varargs fixes"

This reverts commit 77ab1372c6a3b8a89c3cedcde0bf441bd0ef51ba.
---
 README.md                          |   2 +-
 examples/vector_sum_cpu.nim        |   4 +-
 examples/vector_sum_cuda.nim       |   4 +-
 examples/vector_sum_hip_amd.nim    |   4 +-
 examples/vector_sum_hip_nvidia.nim |   4 +-
 src/cuda.nim                       |   3 -
 src/hip.nim                        |  20 +++++-
 src/hippo.nim                      | 105 +++++++++++------------------
 tests/hip/call_params.nim          |   4 +-
 tests/hip/dot.nim                  |   4 +-
 tests/hip/vector_sum.nim           |   4 +-
 11 files changed, 72 insertions(+), 86 deletions(-)

diff --git a/README.md b/README.md
index f791e0a..b58c32a 100644
--- a/README.md
+++ b/README.md
@@ -20,7 +20,7 @@ proc addKernel*(a, b: cint; c: ptr[cint]) {.hippoGlobal.} =
 var c: int32
 var dev_c: ptr[int32]
 handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
-handleError(launchKernel(addKernel,args = (2,7,dev_c)))
+hippoLaunchKernel(addKernel,args = (2,7,dev_c))
 handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
 echo "2 + 7 = ", c
 handleError(hipFree(dev_c))
diff --git a/examples/vector_sum_cpu.nim b/examples/vector_sum_cpu.nim
index 2b9cb24..4d940c0 100644
--- a/examples/vector_sum_cpu.nim
+++ b/examples/vector_sum_cpu.nim
@@ -34,11 +34,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
diff --git a/examples/vector_sum_cuda.nim b/examples/vector_sum_cuda.nim
index 5f0fa21..01f2c45 100644
--- a/examples/vector_sum_cuda.nim
+++ b/examples/vector_sum_cuda.nim
@@ -35,11 +35,11 @@ proc main() =
   handleError(cudaMemcpy(dev_b, addr b[0], sizeof(int32)*N, cudaMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(cudaMemcpy(addr c[0], dev_c, sizeof(int32)*N, cudaMemcpyDeviceToHost))
diff --git a/examples/vector_sum_hip_amd.nim b/examples/vector_sum_hip_amd.nim
index 3aba581..8bdb762 100644
--- a/examples/vector_sum_hip_amd.nim
+++ b/examples/vector_sum_hip_amd.nim
@@ -34,11 +34,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
diff --git a/examples/vector_sum_hip_nvidia.nim b/examples/vector_sum_hip_nvidia.nim
index 00102f0..165119b 100644
--- a/examples/vector_sum_hip_nvidia.nim
+++ b/examples/vector_sum_hip_nvidia.nim
@@ -37,11 +37,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))
diff --git a/src/cuda.nim b/src/cuda.nim
index 296f0ea..f4601ed 100644
--- a/src/cuda.nim
+++ b/src/cuda.nim
@@ -55,9 +55,6 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di
 proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
                      args: ptr pointer, sharedMemBytes: uint32_t, stream: cudaStream_t): cudaError_t {.
     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
-# proc cudaLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
-#                      args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {.
-#     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
 proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".}
 proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
 proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
diff --git a/src/hip.nim b/src/hip.nim
index 942a412..ac15624 100644
--- a/src/hip.nim
+++ b/src/hip.nim
@@ -52,9 +52,9 @@ proc hipFree*(`ptr`: pointer): hipError_t {.header: "hip/hip_runtime.h",importcp
 proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
                      args: ptr pointer): hipError_t {.
     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
-# proc hipLaunchKernel*(function_address: pointer; numBlocks: dim3; dimBlocks: dim3;
-#                      args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
-#     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
+proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim3;
+                     args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
+    importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
 proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".}
 proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
 proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
@@ -68,6 +68,20 @@ proc hipLaunchKernelGGL*(
   ) {.
     importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.}
 
+proc hipModuleLaunchKernel*(
+  function_address: pointer;
+  grid_dim_x: uint32_t;
+  grid_dim_y: uint32_t;
+  grid_dim_z: uint32_t;
+  block_dim_x: uint32_t;
+  block_dim_y: uint32_t;
+  block_dim_z: uint32_t;
+  sharedMemBytes: uint32_t;
+  stream: hipStream_t;
+  kernel_params: ptr pointer;
+  extra: ptr pointer;
+  ): hipError_t {.importcpp: "hipModuleLaunchKernel(@)", header: "hip/hip_runtime.h".}
+
 
 type ConstCString* {.importc: "const char*".} = object
 converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.}
diff --git a/src/hippo.nim b/src/hippo.nim
index 280d0e7..4339d85 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -134,88 +134,63 @@ proc `=destroy`*(mem: var GpuMemory) =
 # -------------------
 # Kernel Execution
 
-proc launchKernel*(
-  kernel: proc,
-  gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block
-  blockDim: Dim3 = newDim3(1,1,1),  # default to 1 thread per block
-  sharedMemBytes: uint32 = 0,
-  stream: HippoStream = nil,
-  args: tuple
-): HippoError =
-  # launchKernel is designed to be similar to `kernel`<<<blockDim, gridDim>>>(args)
-
-  # this function is horrible but it works
-  # needs to be refactored to handle all the different runtimes and arguments better
-
-  # having some issues between hip and hip-cpu, so defining different versions of launchKernel
-  when HippoRuntime == "HIP" and HipPlatform == "amd":
-    # This branch works for all args
-    echo "executing HIP"
-    var kernelArgs: seq[pointer]
-    for key, arg in args.fieldPairs:
-      kernelArgs.add(cast[pointer](addr arg))
+template hippoLaunchKernel*(
+  kernel: proc,                     ## The GPU kernel procedure to launch
+  gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
+  blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
+  sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
+  stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
+  args: tuple,                      ## Arguments to pass to the GPU kernel
+) =
+  var result: HippoError
+  ## Launch a kernel on the GPU.
+  ## also checks if launchKernel() returns an error.
+  ## Important: this only checks if the kernel launch was successful, not the kernel itself.
+  # 
+  # This code is kinda gross, the launch kernel functions have a lot of different signatures.
+  var kernelArgs: seq[pointer]
+  for key, arg in args.fieldPairs:
+    let a1 = arg
+    kernelArgs.add(cast[pointer](addr a1))
+  when HippoRuntime == "HIP":
     result = hipLaunchKernel(
       cast[pointer](kernel),
       gridDim,
       blockDim,
       cast[ptr pointer](addr kernelArgs[0]),
+      sharedMemBytes,
+      stream
     )
-  elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
-    # TODO fix args on this branch
-    hipLaunchKernelGGL(
-      kernel,
-      gridDim,
-      blockDim,
-      0, # TODO
-      nil, # TODO
-      # TODO handle args properly
-      cast[ptr[cint]](args[0]),
-      cast[ptr[cint]](args[1]),
-      cast[ptr[cint]](args[2])
-      )
-    result = hipGetLastError()
   elif HippoRuntime == "HIP_CPU":
-    # TODO fix args on this branch
-    echo "executing kernel on CPU"
-    hipLaunchKernelGGL(
-      kernel,
-      gridDim,
-      blockDim,
-      0, # TODO
-      nil, # TODO
-      # TODO handle args properly
-      args[0],
-      args[1],
-      args[2]
+    # I couldn't find a good way to call hipLaunchKernelGGL() with args as a tuple from nim
+    # so I'm using hipModuleLaunchKernel() instead, It's a much simpler interface.
+    result = hipModuleLaunchKernel(
+      cast[pointer](kernel),
+      gridDim.x,
+      gridDim.y,
+      gridDim.z,
+      blockDim.x,
+      blockDim.y,
+      blockDim.z,
+      sharedMemBytes,
+      stream,
+      cast[ptr pointer](addr kernelArgs[0]),
+      nil
     )
-    result = hipGetLastError()
   elif HippoRuntime == "CUDA":
-    # This branch works for all args
-    echo "executing CUDA"
-    var kernelArgs: seq[pointer]
-    for key, arg in args.fieldPairs:
-      kernelArgs.add(cast[pointer](addr arg))
     result = cudaLaunchKernel(
       kernel,
       gridDim,
       blockDim,
-      cast[ptr pointer](addr kernelArgs[0])
-      #sharedMemBytes,
-      #stream
+      cast[ptr pointer](addr kernelArgs[0]),
+      sharedMemBytes,
+      stream
     )
   else:
     raise newException(Exception, &"Unknown runtime: {HippoRuntime}")
 
-template hippoLaunchKernel*(
-  kernel: proc,                     ## The GPU kernel procedure to launch
-  gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
-  blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
-  sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
-  stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
-  args: tuple,                      ## Arguments to pass to the GPU kernel
-) =
-  ## Launch a kernel on the GPU and check for errors
-  handleError(launchKernel(kernel, gridDim, blockDim, sharedMemBytes, stream, args))
+  handleError(result)
+  
 
 
 # -------------------
diff --git a/tests/hip/call_params.nim b/tests/hip/call_params.nim
index 7013666..edb4f72 100644
--- a/tests/hip/call_params.nim
+++ b/tests/hip/call_params.nim
@@ -8,10 +8,10 @@ proc main() =
   var c: int32
   var dev_c: ptr[int32]
   handleError(hipMalloc(cast[ptr pointer](addr dev_c), sizeof(int32).cint))
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addKernel,
     args = (2,7,dev_c)
-  ))
+  )
   handleError(hipMemcpy(addr c, dev_c, sizeof(int32).cint, hipMemcpyDeviceToHost))
   echo "2 + 7 = ", c
   handleError(hipFree(dev_c))
diff --git a/tests/hip/dot.nim b/tests/hip/dot.nim
index 3b96f4a..6c97bb6 100644
--- a/tests/hip/dot.nim
+++ b/tests/hip/dot.nim
@@ -62,12 +62,12 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(float64)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     dot,
     gridDim = newDim3(BlocksPerGrid.uint32),
     blockDim = newDim3(ThreadsPerBlock.uint32),
     args = (dev_a, dev_b, dev_partial_c)
-  ))
+  )
 
   # copy memory back from GPU to CPU
   handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(float64), hipMemcpyDeviceToHost))
diff --git a/tests/hip/vector_sum.nim b/tests/hip/vector_sum.nim
index 9e7f983..8fa62b9 100644
--- a/tests/hip/vector_sum.nim
+++ b/tests/hip/vector_sum.nim
@@ -29,11 +29,11 @@ proc main() =
   handleError(hipMemcpy(dev_b, addr b[0], sizeof(int32)*N, hipMemcpyHostToDevice))
 
   # launch kernel
-  handleError(launchKernel(
+  hippoLaunchKernel(
     addkernel,
     gridDim = newDim3(N.uint32),
     args = (dev_a, dev_b, dev_c)
-  ))
+  )
 
   # copy result back to host
   handleError(hipMemcpy(addr c[0], dev_c, sizeof(int32)*N, hipMemcpyDeviceToHost))

From 052d1874f8a6766500ccf03def1e85f13e19dfcf Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 20:38:28 -0400
Subject: [PATCH 03/12] fix kernel args

---
 .gitignore          |  1 +
 Dockerfile          | 16 ++++++++++++++++
 docker-compose.yaml |  9 +++++++++
 src/hippo.nim       |  4 ++--
 4 files changed, 28 insertions(+), 2 deletions(-)
 create mode 100644 Dockerfile
 create mode 100644 docker-compose.yaml

diff --git a/.gitignore b/.gitignore
index f5e0412..21240fa 100644
--- a/.gitignore
+++ b/.gitignore
@@ -2,6 +2,7 @@
 *
 !*/
 !*.*
+!Dockerfile
 
 # normal ignores:
 *.exe
diff --git a/Dockerfile b/Dockerfile
new file mode 100644
index 0000000..c3ae070
--- /dev/null
+++ b/Dockerfile
@@ -0,0 +1,16 @@
+FROM ubuntu:22.04
+
+RUN apt update && apt install -y git gcc g++ make libtbb2-dev && apt clean
+
+RUN git clone --branch devel https://github.com/nim-lang/Nim.git --depth 1 /opt/Nim
+
+WORKDIR /opt/Nim
+RUN sh ./build_all.sh 
+
+RUN ./bin/nim c koch
+RUN ./koch boot -d:release
+RUN ./koch tools
+
+RUN mkdir -p /root/.nimble/bin
+
+ENV PATH="/opt/Nim/bin/:/root/.nimble/bin:${PATH}"
diff --git a/docker-compose.yaml b/docker-compose.yaml
new file mode 100644
index 0000000..408f609
--- /dev/null
+++ b/docker-compose.yaml
@@ -0,0 +1,9 @@
+version: '3.8'
+
+services:
+  hippo-build:
+    build:
+      context: .
+      dockerfile: ./Dockerfile
+    volumes:
+      - .:/p/hippo
\ No newline at end of file
diff --git a/src/hippo.nim b/src/hippo.nim
index 4339d85..70b9a1e 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -174,8 +174,8 @@ template hippoLaunchKernel*(
       blockDim.z,
       sharedMemBytes,
       stream,
-      cast[ptr pointer](addr kernelArgs[0]),
-      nil
+      nil,
+      cast[ptr pointer](addr kernelArgs[0])
     )
   elif HippoRuntime == "CUDA":
     result = cudaLaunchKernel(

From 2c884a6c680a5e4bde5fa96012ff23df0c5c76f8 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 21:27:00 -0400
Subject: [PATCH 04/12] back where I started

---
 Dockerfile    |  2 +-
 src/hip.nim   | 15 ---------------
 src/hippo.nim | 41 +++++++++++++++++++++++++++--------------
 3 files changed, 28 insertions(+), 30 deletions(-)

diff --git a/Dockerfile b/Dockerfile
index c3ae070..8da4353 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -1,6 +1,6 @@
 FROM ubuntu:22.04
 
-RUN apt update && apt install -y git gcc g++ make libtbb2-dev && apt clean
+RUN apt update && apt install -y git gdb gcc g++ make libtbb2-dev && apt clean
 
 RUN git clone --branch devel https://github.com/nim-lang/Nim.git --depth 1 /opt/Nim
 
diff --git a/src/hip.nim b/src/hip.nim
index ac15624..503e5cc 100644
--- a/src/hip.nim
+++ b/src/hip.nim
@@ -68,21 +68,6 @@ proc hipLaunchKernelGGL*(
   ) {.
     importcpp: "hipLaunchKernelGGL(@)", header: "hip/hip_runtime.h", varargs.}
 
-proc hipModuleLaunchKernel*(
-  function_address: pointer;
-  grid_dim_x: uint32_t;
-  grid_dim_y: uint32_t;
-  grid_dim_z: uint32_t;
-  block_dim_x: uint32_t;
-  block_dim_y: uint32_t;
-  block_dim_z: uint32_t;
-  sharedMemBytes: uint32_t;
-  stream: hipStream_t;
-  kernel_params: ptr pointer;
-  extra: ptr pointer;
-  ): hipError_t {.importcpp: "hipModuleLaunchKernel(@)", header: "hip/hip_runtime.h".}
-
-
 type ConstCString* {.importc: "const char*".} = object
 converter toCString*(self: ConstCString): cstring {.importc: "(char*)", noconv, nodecl.}
 converter toConstCString*(self: cstring): ConstCString {.importc: "(const char*)", noconv, nodecl.}
diff --git a/src/hippo.nim b/src/hippo.nim
index 70b9a1e..d738518 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -140,7 +140,7 @@ template hippoLaunchKernel*(
   blockDim: Dim3 = newDim3(1,1,1),  ## default to 1 thread per block
   sharedMemBytes: uint32 = 0,       ## dynamic shared memory amount to allocate
   stream: HippoStream = nil,        ## Which device stream to run under (defaults to null)
-  args: tuple,                      ## Arguments to pass to the GPU kernel
+  args: tuple,                ## Arguments to pass to the GPU kernel
 ) =
   var result: HippoError
   ## Launch a kernel on the GPU.
@@ -152,7 +152,7 @@ template hippoLaunchKernel*(
   for key, arg in args.fieldPairs:
     let a1 = arg
     kernelArgs.add(cast[pointer](addr a1))
-  when HippoRuntime == "HIP":
+  when HippoRuntime == "HIP" and HipPlatform == "amd":
     result = hipLaunchKernel(
       cast[pointer](kernel),
       gridDim,
@@ -161,22 +161,35 @@ template hippoLaunchKernel*(
       sharedMemBytes,
       stream
     )
+  elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
+    hipLaunchKernelGGL(
+      kernel,
+      gridDim,
+      blockDim,
+      sharedMemBytes,
+      stream,
+      # TODO handle args properly
+      cast[ptr[cint]](args[0]),
+      cast[ptr[cint]](args[1]),
+      cast[ptr[cint]](args[2])
+      )
+    result = hipGetLastError()
   elif HippoRuntime == "HIP_CPU":
-    # I couldn't find a good way to call hipLaunchKernelGGL() with args as a tuple from nim
-    # so I'm using hipModuleLaunchKernel() instead, It's a much simpler interface.
-    result = hipModuleLaunchKernel(
-      cast[pointer](kernel),
-      gridDim.x,
-      gridDim.y,
-      gridDim.z,
-      blockDim.x,
-      blockDim.y,
-      blockDim.z,
+
+    # wonder if it's possible to build a macro to turn kernelArgs to args?
+    # the cpp interface is tricky
+    hipLaunchKernelGGL(
+      kernel,
+      gridDim,
+      blockDim,
       sharedMemBytes,
       stream,
-      nil,
-      cast[ptr pointer](addr kernelArgs[0])
+      # TODO handle args properly
+      args[0],
+      args[1],
+      args[2]
     )
+    result = hipGetLastError()
   elif HippoRuntime == "CUDA":
     result = cudaLaunchKernel(
       kernel,

From f8fce03174a055671f12fef53c918d4d16e0d089 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 21:45:36 -0400
Subject: [PATCH 05/12] fix varargs

---
 src/hippo.nim | 39 +++++++++++++++++++++++++++------------
 1 file changed, 27 insertions(+), 12 deletions(-)

diff --git a/src/hippo.nim b/src/hippo.nim
index d738518..2b4b0ff 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -134,6 +134,29 @@ proc `=destroy`*(mem: var GpuMemory) =
 # -------------------
 # Kernel Execution
 
+macro hipLaunchKernelGGLWithTuple(
+    kernel: proc,
+  gridDim: Dim3 = newDim3(1,1,1),
+  blockDim: Dim3 = newDim3(1,1,1),
+  sharedMemBytes: uint32 = 0,
+  stream: HippoStream = nil,
+  args: tuple
+  ): untyped =
+
+  var callNode = newCall(bindSym"hipLaunchKernelGGL")
+
+  # add the fixed vars
+  callNode.add kernel
+  callNode.add gridDim
+  callNode.add blockDim
+  callNode.add sharedMemBytes
+  callNode.add stream
+
+  # add every value of the tuple
+  for child in args:
+    callNode.add child
+  result = callNode
+
 template hippoLaunchKernel*(
   kernel: proc,                     ## The GPU kernel procedure to launch
   gridDim: Dim3 = newDim3(1,1,1),   ## default to a grid of 1 block
@@ -162,32 +185,24 @@ template hippoLaunchKernel*(
       stream
     )
   elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
-    hipLaunchKernelGGL(
+    hipLaunchKernelGGLWithTuple(
       kernel,
       gridDim,
       blockDim,
       sharedMemBytes,
       stream,
-      # TODO handle args properly
-      cast[ptr[cint]](args[0]),
-      cast[ptr[cint]](args[1]),
-      cast[ptr[cint]](args[2])
+      args
       )
     result = hipGetLastError()
   elif HippoRuntime == "HIP_CPU":
 
-    # wonder if it's possible to build a macro to turn kernelArgs to args?
-    # the cpp interface is tricky
-    hipLaunchKernelGGL(
+    hipLaunchKernelGGLWithTuple(
       kernel,
       gridDim,
       blockDim,
       sharedMemBytes,
       stream,
-      # TODO handle args properly
-      args[0],
-      args[1],
-      args[2]
+      args
     )
     result = hipGetLastError()
   elif HippoRuntime == "CUDA":

From c771ed37fd35829df829422d279993b924f91b0f Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 21:48:55 -0400
Subject: [PATCH 06/12] bump version

---
 hippo.nimble | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/hippo.nimble b/hippo.nimble
index 7ada4ff..348d8c7 100644
--- a/hippo.nimble
+++ b/hippo.nimble
@@ -1,4 +1,4 @@
-version     = "0.5.2"
+version     = "0.5.5"
 author      = "Andrew Brower"
 description = "HIP library for Nim"
 license     = "MIT"

From 83634f9e6cd30db2b5782276c69ed1332e2f5f8a Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 22:03:28 -0400
Subject: [PATCH 07/12] fixes

---
 src/hippo.nim | 15 ++-------------
 1 file changed, 2 insertions(+), 13 deletions(-)

diff --git a/src/hippo.nim b/src/hippo.nim
index 2b4b0ff..00b8ad3 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -135,7 +135,7 @@ proc `=destroy`*(mem: var GpuMemory) =
 # Kernel Execution
 
 macro hipLaunchKernelGGLWithTuple(
-    kernel: proc,
+  kernel: proc,
   gridDim: Dim3 = newDim3(1,1,1),
   blockDim: Dim3 = newDim3(1,1,1),
   sharedMemBytes: uint32 = 0,
@@ -184,18 +184,7 @@ template hippoLaunchKernel*(
       sharedMemBytes,
       stream
     )
-  elif HippoRuntime == "HIP" and HipPlatform == "nvidia":
-    hipLaunchKernelGGLWithTuple(
-      kernel,
-      gridDim,
-      blockDim,
-      sharedMemBytes,
-      stream,
-      args
-      )
-    result = hipGetLastError()
-  elif HippoRuntime == "HIP_CPU":
-
+  elif (HippoRuntime == "HIP" and HipPlatform == "nvidia") or HippoRuntime == "HIP_CPU":
     hipLaunchKernelGGLWithTuple(
       kernel,
       gridDim,

From 6a2ba81e4ea16d15f8cc74b5e38addf102439111 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 22:04:45 -0400
Subject: [PATCH 08/12] ensure cpu runtime for vector_sum_cpu

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 743ed86..0b95d2d 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -35,7 +35,7 @@ jobs:
 
     - name: Build Examples
       run: |
-        nim cpp examples/vector_sum_cpu.nim
+        HIP_PLATFORM=HIP_CPU nim cpp examples/vector_sum_cpu.nim
         nim cpp examples/vector_sum_cuda.nim
         nim cpp examples/vector_sum_hippo.nim
         nim cpp examples/vector_sum_hip_amd.nim

From fc8561e0a7148029fb075aa5ede7d0afc1966b00 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 22:20:25 -0400
Subject: [PATCH 09/12] undo env var

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 0b95d2d..743ed86 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -35,7 +35,7 @@ jobs:
 
     - name: Build Examples
       run: |
-        HIP_PLATFORM=HIP_CPU nim cpp examples/vector_sum_cpu.nim
+        nim cpp examples/vector_sum_cpu.nim
         nim cpp examples/vector_sum_cuda.nim
         nim cpp examples/vector_sum_hippo.nim
         nim cpp examples/vector_sum_hip_amd.nim

From 662419dc6d099867b92b2579054717ffeb9fe4d5 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 22:22:39 -0400
Subject: [PATCH 10/12] skip cpu example for now

---
 .github/workflows/build.yml | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 743ed86..6c39094 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -35,12 +35,12 @@ jobs:
 
     - name: Build Examples
       run: |
-        nim cpp examples/vector_sum_cpu.nim
+        #nim cpp examples/vector_sum_cpu.nim
         nim cpp examples/vector_sum_cuda.nim
         nim cpp examples/vector_sum_hippo.nim
         nim cpp examples/vector_sum_hip_amd.nim
         #nim cpp examples/vector_sum_hip_nvidia.nim
     
-    - name: Execute CPU Example
-      run: |
-        ./examples/vector_sum_cpu
+    # - name: Execute CPU Example
+    #   run: |
+    #     ./examples/vector_sum_cpu

From 317862222ce17240ede8555349f7a4fd4cc063e8 Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 22:33:10 -0400
Subject: [PATCH 11/12] oops

---
 .github/workflows/build.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index 6c39094..a2a89c2 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -35,7 +35,7 @@ jobs:
 
     - name: Build Examples
       run: |
-        #nim cpp examples/vector_sum_cpu.nim
+        nim cpp examples/vector_sum_cpu.nim
         nim cpp examples/vector_sum_cuda.nim
         nim cpp examples/vector_sum_hippo.nim
         nim cpp examples/vector_sum_hip_amd.nim

From bc52fbb2ee2269d3952e1f937306cb45a720ebab Mon Sep 17 00:00:00 2001
From: monofuel <monofuel@japura.net>
Date: Sun, 1 Sep 2024 22:34:53 -0400
Subject: [PATCH 12/12] conditionally define macro

---
 src/hippo.nim | 45 +++++++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/src/hippo.nim b/src/hippo.nim
index 00b8ad3..7471e96 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -134,28 +134,29 @@ proc `=destroy`*(mem: var GpuMemory) =
 # -------------------
 # Kernel Execution
 
-macro hipLaunchKernelGGLWithTuple(
-  kernel: proc,
-  gridDim: Dim3 = newDim3(1,1,1),
-  blockDim: Dim3 = newDim3(1,1,1),
-  sharedMemBytes: uint32 = 0,
-  stream: HippoStream = nil,
-  args: tuple
-  ): untyped =
-
-  var callNode = newCall(bindSym"hipLaunchKernelGGL")
-
-  # add the fixed vars
-  callNode.add kernel
-  callNode.add gridDim
-  callNode.add blockDim
-  callNode.add sharedMemBytes
-  callNode.add stream
-
-  # add every value of the tuple
-  for child in args:
-    callNode.add child
-  result = callNode
+when HippoRuntime == "HIP" or HippoRuntime == "HIP_CPU":
+  macro hipLaunchKernelGGLWithTuple(
+    kernel: proc,
+    gridDim: Dim3 = newDim3(1,1,1),
+    blockDim: Dim3 = newDim3(1,1,1),
+    sharedMemBytes: uint32 = 0,
+    stream: HippoStream = nil,
+    args: tuple
+    ): untyped =
+
+    var callNode = newCall(bindSym"hipLaunchKernelGGL")
+
+    # add the fixed vars
+    callNode.add kernel
+    callNode.add gridDim
+    callNode.add blockDim
+    callNode.add sharedMemBytes
+    callNode.add stream
+
+    # add every value of the tuple
+    for child in args:
+      callNode.add child
+    result = callNode
 
 template hippoLaunchKernel*(
   kernel: proc,                     ## The GPU kernel procedure to launch