Re-use pre-converted kernel arguments when launching kernels. #2472
Addresses part of #2456, by avoiding a redundant `cudaconvert` call on kernel launches. The idea is that, instead of first calling `cudaconvert` + `typeof` to drive compilation, and then again on the kernel's arguments when doing the actual launch (see below for why this split is necessary), we only call `cudaconvert` at launch time when there's a mismatch between an argument's type and what the kernel was compiled for. This makes it possible for `@cuda` to forward the already-converted arguments to the kernel, avoiding a secondary conversion at launch time.
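
To illustrate the idea, here is a minimal sketch of the launch-time check (with a hypothetical helper name, not the actual code in `@cuda`): an argument is only run through `cudaconvert` when its type no longer matches the compiled signature.

```julia
using CUDA

# Hypothetical helper, for illustration only: forward an argument untouched
# when its type already matches what the kernel was compiled for, and fall
# back to `cudaconvert` only on a mismatch. An already-converted argument
# (e.g. a CuDeviceArray) passes through unchanged; a host-side CuArray
# still gets converted.
maybe_convert(compiled_type::Type, arg) =
    typeof(arg) === compiled_type ? arg : cudaconvert(arg)
```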
Note that the whole distinction between compile-time and launch-time `cudaconvert` exists to make it possible to do the following:
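
For example (a minimal sketch, using a hypothetical `vadd` kernel):

```julia
using CUDA

function vadd(c, a, b)
    i = threadIdx().x + (blockIdx().x - 1) * blockDim().x
    i <= length(c) && (@inbounds c[i] = a[i] + b[i])
    return
end

a, b = CUDA.rand(1024), CUDA.rand(1024)
c = similar(a)

# `@cuda launch=false` converts the arguments only to derive the type
# signature to compile for, and returns a callable kernel object ...
kernel = @cuda launch=false vadd(c, a, b)

# ... which is then invoked with the *unconverted* CuArrays, so the kernel
# object has to `cudaconvert` them again at call time.
kernel(c, a, b; threads=256, blocks=cld(length(c), 256))
```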
That is, to make `launch=false` user-friendly and let users pass non-converted arguments into a pre-compiled kernel, the API has to call `cudaconvert` twice: once when compiling as part of `@cuda launch=false`, and once when calling the kernel. The alternative would be to have users explicitly call `cudaconvert(args)` when launching the kernel, but I didn't want to expose the implementation detail that arguments are converted.
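
For reference, that rejected alternative would have looked roughly like this (a sketch, continuing the hypothetical `vadd` example above):

```julia
kernel = @cuda launch=false vadd(c, a, b)

# Callers would need to know that arguments get converted, and do it themselves:
kernel(cudaconvert(c), cudaconvert(a), cudaconvert(b);
       threads=256, blocks=cld(length(c), 256))
```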
cc @vchuravy @pxl-th @tgymnich: this pattern probably applies to other GPU back-ends as well. CUDA.jl's `cudaconvert` is somewhat expensive because of our automatic memory tracking (https://juliagpu.org/post/2024-05-28-cuda_5.4/#tracked_memory_allocations, `CUDA.jl/src/memory.jl` lines 494 to 598 at 76e2972).