diff --git a/.github/workflows/build.yml b/.github/workflows/build.yml
index bc27c22..eeb0c94 100644
--- a/.github/workflows/build.yml
+++ b/.github/workflows/build.yml
@@ -12,7 +12,7 @@ jobs:
       - uses: actions/checkout@v3
         with:
           submodules: recursive
-      - uses: jiro4989/setup-nim-action@v1
+      - uses: jiro4989/setup-nim-action@v2
         with:
           repo-token: ${{ secrets.GITHUB_TOKEN }}
       - name: Install dependencies
diff --git a/src/cuda.nim b/src/cuda.nim
index 9b62313..3c4c561 100644
--- a/src/cuda.nim
+++ b/src/cuda.nim
@@ -1,5 +1,5 @@
 # CUDA runtime C++ FFI
-import std/strformat
+import std/strformat, macros
 
 type
   size_t* = uint64
@@ -59,6 +59,8 @@ proc cudaLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Di
 #     args: ptr pointer; sharedMemBytes: csize_t; stream: cudaStream_t): cint {.
 #     importcpp: "cudaLaunchKernel(@)", header: "cuda_runtime.h".}
 proc cudaDeviceSynchronize*(): cudaError_t {.header: "cuda_runtime.h",importcpp: "cudaDeviceSynchronize(@)".}
+proc cudaSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
+proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "cuda_runtime.h".}
 
 proc cudaLaunchKernelGGL*(
   function_address: proc;
@@ -103,17 +105,36 @@ let
   gridDim* {.importc, inject, header: "cuda_runtime.h".}: GridDim
   threadIdx* {.importc, inject, header: "cuda_runtime.h".}: ThreadIdx
 
-template hippoGlobal*(body: untyped) =
-  {.push stackTrace: off, checks: off, exportc, codegenDecl: "__global__ $# $#$#".}
-  body
-  {.pop}
-
-template hippoDevice*(body: typed) =
-  {.push stackTrace: off, checks: off, exportc, codegenDecl: "__device__ $# $#$#".}
-  body
-  {.pop}
-
-template hippoHost*(body: typed) =
-  {.push stackTrace: off, checks: off, exportc, codegenDecl: "__host__ $# $#$#".}
-  body
-  {.pop}
\ No newline at end of file
+macro hippoGlobal*(fn: untyped): untyped =
+  let globalPragma: NimNode = quote:
+    {. exportc, codegenDecl: "__global__ $# $#$#".}
+
+  fn.addPragma(globalPragma[0])
+  fn.addPragma(globalPragma[1])
+  quote do:
+    {.push stackTrace: off, checks: off.}
+    `fn`
+    {.pop.}
+
+macro hippoDevice*(fn: untyped): untyped =
+  let globalPragma: NimNode = quote:
+    {. exportc, codegenDecl: "__device__ $# $#$#".}
+
+  fn.addPragma(globalPragma[0])
+  fn.addPragma(globalPragma[1])
+  quote do:
+    {.push stackTrace: off, checks: off.}
+    `fn`
+    {.pop.}
+
+
+macro hippoHost*(fn: untyped): untyped =
+  let globalPragma: NimNode = quote:
+    {. exportc, codegenDecl: "__host__ $# $#$#".}
+
+  fn.addPragma(globalPragma[0])
+  fn.addPragma(globalPragma[1])
+  quote do:
+    {.push stackTrace: off, checks: off.}
+    `fn`
+    {.pop.}
diff --git a/src/hip.nim b/src/hip.nim
index f90415e..bd13604 100644
--- a/src/hip.nim
+++ b/src/hip.nim
@@ -1,5 +1,5 @@
 # HIP runtime C++ FFI
-import std/strformat
+import std/strformat, macros
 
 type
   size_t* = uint64
@@ -56,6 +56,8 @@ proc hipLaunchKernel*(function_address: pointer; numBlocks: Dim3; dimBlocks: Dim
 #     args: ptr pointer; sharedMemBytes: csize_t; stream: hipStream_t): cint {.
 #     importcpp: "hipLaunchKernel(@)", header: "hip/hip_runtime.h".}
 proc hipDeviceSynchronize*(): hipError_t {.header: "hip/hip_runtime.h",importcpp: "hipDeviceSynchronize(@)".}
+proc hipSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
+proc hippoSyncthreads*() {.importcpp: "__syncthreads()", header: "hip/hip_runtime.h".}
 
 proc hipLaunchKernelGGL*(
   function_address: proc;
@@ -99,17 +101,36 @@ let
   gridDim* {.importc, inject, header: "hip/hip_runtime.h".}: GridDim
   threadIdx* {.importc, inject, header: "hip/hip_runtime.h".}: ThreadIdx
 
-template hippoGlobal*(body: untyped) =
-  {.push stackTrace: off, checks: off, exportc, codegenDecl: "__global__ $# $#$#".}
-  body
-  {.pop}
-
-template hippoDevice*(body: typed) =
-  {.push stackTrace: off, checks: off, exportc, codegenDecl: "__device__ $# $#$#".}
-  body
-  {.pop}
-
-template hippoHost*(body: typed) =
-  {.push stackTrace: off, checks: off, exportc, codegenDecl: "__host__ $# $#$#".}
-  body
-  {.pop}
\ No newline at end of file
+macro hippoGlobal*(fn: untyped): untyped =
+  let globalPragma: NimNode = quote:
+    {. exportc, codegenDecl: "__global__ $# $#$#".}
+
+  fn.addPragma(globalPragma[0])
+  fn.addPragma(globalPragma[1])
+  quote do:
+    {.push stackTrace: off, checks: off.}
+    `fn`
+    {.pop.}
+
+macro hippoDevice*(fn: untyped): untyped =
+  let globalPragma: NimNode = quote:
+    {. exportc, codegenDecl: "__device__ $# $#$#".}
+
+  fn.addPragma(globalPragma[0])
+  fn.addPragma(globalPragma[1])
+  quote do:
+    {.push stackTrace: off, checks: off.}
+    `fn`
+    {.pop.}
+
+
+macro hippoHost*(fn: untyped): untyped =
+  let globalPragma: NimNode = quote:
+    {. exportc, codegenDecl: "__host__ $# $#$#".}
+
+  fn.addPragma(globalPragma[0])
+  fn.addPragma(globalPragma[1])
+  quote do:
+    {.push stackTrace: off, checks: off.}
+    `fn`
+    {.pop.}
diff --git a/src/hippo.nim b/src/hippo.nim
index 2cdb741..367175b 100644
--- a/src/hippo.nim
+++ b/src/hippo.nim
@@ -83,7 +83,7 @@ proc launchKernel*(
   kernel: proc,
   gridDim: Dim3 = newDim3(1,1,1), # default to a grid of 1 block
   blockDim: Dim3 = newDim3(1,1,1), # default to 1 thread per block
-  sharedMemBytes: uint32 = 0,
+  sharedMemBytes: uint32 = 0, # TODO dynamic shared memory
   stream: HippoStream = nil,
   args: tuple
 ): HippoError =
diff --git a/tests/hip/dot.nim b/tests/hip/dot.nim
new file mode 100644
index 0000000..476d6c9
--- /dev/null
+++ b/tests/hip/dot.nim
@@ -0,0 +1,95 @@
+import hippo
+
+# GPU Dot product
+
+const
+  N = 33 * 1024
+  ThreadsPerBlock: int = 256
+  BlocksPerGrid: int = min(32, ((N + ThreadsPerBlock - 1) div ThreadsPerBlock))
+
+# TODO improve this
+{.pragma: hippoShared, exportc, codegenDecl: "__shared__ $# $#".}
+
+proc dot(a, b, c: ptr[cfloat]){.hippoGlobal.} =
+
+  var cache {.hippoShared.}: array[ThreadsPerBlock, cfloat]
+
+  # TODO figure out how to do this properly
+  let aArray = cast[ptr UncheckedArray[cfloat]](a)
+  let bArray = cast[ptr UncheckedArray[cfloat]](b)
+  let cArray = cast[ptr UncheckedArray[cfloat]](c)
+  let cacheArray = cast[ptr UncheckedArray[cfloat]](addr cache)
+
+  let cacheIndex = threadIdx.x
+  var tid = threadIdx.x + blockIdx.x * blockDim.x
+  var temp: cfloat = 0
+  while tid < N:
+    temp += aArray[tid] * bArray[tid]
+    tid += blockDim.x * gridDim.x
+
+  # set the cache values
+  cacheArray[cacheIndex] = temp
+
+  # synchronize threads in this block
+  hippoSyncthreads()
+
+  # for reductions, threadsPerBlock must be a power of 2
+  # because of the following code
+  var i = blockDim.x div 2
+  while i != 0:
+    if cacheIndex < i:
+      cacheArray[cacheIndex] += cacheArray[cacheIndex + i]
+    hippoSyncthreads()
+    i = i div 2
+
+  if cacheIndex == 0:
+    cArray[blockIdx.x] = cacheArray[0]
+
+
+proc main() =
+  var a, b, partial_c: array[N, cfloat]
+  var dev_a, dev_b, dev_partial_c: pointer
+
+  # allocate gpu memory
+  handleError(hipMalloc(addr dev_a, sizeof(cfloat)*N))
+  handleError(hipMalloc(addr dev_b, sizeof(cfloat)*N))
+  handleError(hipMalloc(addr dev_partial_c, BlocksPerGrid * sizeof(cfloat)))
+
+  # fill in host memory with data
+  for i in 0 ..< N:
+    a[i] = i.cfloat
+    b[i] = (i * 2).cfloat
+
+  # copy data to device
+  handleError(hipMemcpy(dev_a, addr a[0], sizeof(cfloat)*N, hipMemcpyHostToDevice))
+  handleError(hipMemcpy(dev_b, addr b[0], sizeof(cfloat)*N, hipMemcpyHostToDevice))
+
+  # launch kernel
+  handleError(launchKernel(
+    dot,
+    gridDim = newDim3(BlocksPerGrid.uint32),
+    blockDim = newDim3(ThreadsPerBlock.uint32),
+    args = (dev_a, dev_b, dev_partial_c)
+  ))
+
+  # copy memory back from GPU to CPU
+  handleError(hipMemcpy(addr partial_c[0], dev_partial_c, BlocksPerGrid * sizeof(cfloat), hipMemcpyDeviceToHost))
+
+  # finish up on the CPU
+  var c: cfloat = 0
+  for i in 0 ..< BlocksPerGrid:
+    c += partial_c[i]
+
+
+  proc sum_squares(x: float): float =
+    result = x * (x + 1) * (2 * x + 1) / 6
+
+  echo "Does GPU value ", c.float, " = ", 2 * sum_squares((N - 1)), "?"
+
+  handleError(hipFree(dev_a))
+  handleError(hipFree(dev_b))
+  handleError(hipFree(dev_partial_c))
+
+
+when isMainModule:
+  main()
\ No newline at end of file