From 6cda00a6bf0312f82e07608b984fd9e8eb700959 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Gunnar=20Farneb=C3=A4ck?=
 <gunnar.farneback@contextvision.se>
Date: Mon, 17 Jul 2023 16:05:52 +0200
Subject: [PATCH] Update the quickstart documentation.

---
 docs/src/index.md      |  2 +-
 docs/src/quickstart.md | 28 ++++++++++++++--------------
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/docs/src/index.md b/docs/src/index.md
index c241feff..dbf90305 100644
--- a/docs/src/index.md
+++ b/docs/src/index.md
@@ -25,7 +25,7 @@ backend and depend on
 
 ### CUDA
 ```julia
-using CUDA
+import CUDA
 using KernelAbstractions
 ```
 [`CUDA.jl`](https://github.com/JuliaGPU/CUDA.jl) is currently the most mature way to program for GPUs.
diff --git a/docs/src/quickstart.md b/docs/src/quickstart.md
index f93b14e7..73d9e74a 100644
--- a/docs/src/quickstart.md
+++ b/docs/src/quickstart.md
@@ -46,33 +46,32 @@ The [`synchronize`](@ref) blocks the *host* until the kernel has completed on th
 ## Launching kernel on the backend
 
 To launch the kernel on a backend-supported backend `isa(backend, KA.GPU)` (e.g., `CUDABackend()`, `ROCBackend()`, `oneBackend()`), we generate the kernel
-for this backend provided by `CUDAKernels`, `ROCKernels`, or `oneAPIKernels`.
+for this backend.
 
 First, we initialize the array using the Array constructor of the chosen backend with
 
 ```julia
-using CUDAKernels # Required to access CUDABackend
+using CUDA: CuArray
 A = CuArray(ones(1024, 1024))
 ```
 
 ```julia
-using ROCKernels # Required to access ROCBackend
+using AMDGPU: ROCArray
 A = ROCArray(ones(1024, 1024))
 ```
 
 ```julia
-using oneAPIKernels # Required to access oneBackend
+using oneAPI: oneArray
 A = oneArray(ones(1024, 1024))
 ```
 The kernel generation and execution are then
 ```julia
+backend = get_backend(A)
 mul2_kernel(backend, 64)(A, ndrange=size(A))
 synchronize(backend)
 all(A .== 2.0)
 ```
 
-For simplicity, we stick with the case of `backend=CUDABackend()`.
-
 ## Synchronization
 !!! danger
     All kernel launches are asynchronous, use [`synchronize(backend)`](@ref)
@@ -82,23 +81,24 @@ The code around KA may heavily rely on
 [`GPUArrays`](https://github.com/JuliaGPU/GPUArrays.jl), for example, to
 intialize variables.
 ```julia
-using CUDAKernels # Required to access CUDABackend
-function mymul(A::CuArray)
+function mymul(A)
     A .= 1.0
-    ev = mul2_kernel(CUDABackend(), 64)(A, ndrange=size(A))
+    backend = get_backend(A)
+    mul2_kernel(backend, 64)(A, ndrange=size(A))
     synchronize(backend)
     all(A .== 2.0)
 end
 ```
 
 ```julia
-using CUDAKernels # Required to access CUDABackend
-function mymul(A::CuArray, B::CuArray)
+function mymul(A, B)
     A .= 1.0
     B .= 3.0
-    mul2_kernel(CUDABackend(), 64)(A, ndrange=size(A))
-    mul2_kernel(CUDABackend(), 64)(A, ndrange=size(A))
-    synchronize(CUDABackend())
+    backend = get_backend(A)
+    @assert get_backend(B) == backend
+    mul2_kernel(backend, 64)(A, ndrange=size(A))
+    mul2_kernel(backend, 64)(B, ndrange=size(B))
+    synchronize(backend)
     all(A .+ B .== 8.0)
 end
 ```