diff --git a/experimental/web/sample_webgpu/main.c b/experimental/web/sample_webgpu/main.c index 8f7d4d4b2dda..aaf8336927ca 100644 --- a/experimental/web/sample_webgpu/main.c +++ b/experimental/web/sample_webgpu/main.c @@ -666,6 +666,10 @@ static iree_status_t allocate_mappable_device_buffer( "unable to allocate buffer of size %" PRIdsz, data_length); } + const iree_hal_buffer_placement_t placement = { + .device = device, + .queue_affinity = IREE_HAL_QUEUE_AFFINITY_ANY, + }; const iree_hal_buffer_params_t target_params = { .usage = IREE_HAL_BUFFER_USAGE_TRANSFER | IREE_HAL_BUFFER_USAGE_MAPPING, .type = @@ -673,8 +677,8 @@ static iree_status_t allocate_mappable_device_buffer( .access = IREE_HAL_MEMORY_ACCESS_ALL, }; return iree_hal_webgpu_buffer_wrap( - device, iree_hal_device_allocator(device), target_params.type, - target_params.access, target_params.usage, data_length, + placement, target_params.type, target_params.access, target_params.usage, + data_length, /*byte_offset=*/0, /*byte_length=*/data_length, device_buffer_handle, iree_allocator_system(), out_buffer); diff --git a/experimental/webgpu/buffer.c b/experimental/webgpu/buffer.c index cd83587a7be1..564393f87229 100644 --- a/experimental/webgpu/buffer.c +++ b/experimental/webgpu/buffer.c @@ -19,7 +19,7 @@ typedef struct iree_hal_webgpu_buffer_t { iree_hal_buffer_t base; - iree_hal_device_t* device; // unowned + iree_allocator_t host_allocator; WGPUBuffer handle; bool is_mapped; } iree_hal_webgpu_buffer_t; @@ -33,14 +33,12 @@ static iree_hal_webgpu_buffer_t* iree_hal_webgpu_buffer_cast( } iree_status_t iree_hal_webgpu_buffer_wrap( - iree_hal_device_t* device, iree_hal_allocator_t* device_allocator, - iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, 
iree_device_size_t byte_length, WGPUBuffer handle, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(device); - IREE_ASSERT_ARGUMENT(device_allocator); IREE_ASSERT_ARGUMENT(handle); IREE_ASSERT_ARGUMENT(out_buffer); *out_buffer = NULL; @@ -50,11 +48,11 @@ iree_status_t iree_hal_webgpu_buffer_wrap( iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { - iree_hal_buffer_initialize(host_allocator, device_allocator, &buffer->base, - allocation_size, byte_offset, byte_length, - memory_type, allowed_access, allowed_usage, + iree_hal_buffer_initialize(placement, &buffer->base, allocation_size, + byte_offset, byte_length, memory_type, + allowed_access, allowed_usage, &iree_hal_webgpu_buffer_vtable, &buffer->base); - buffer->device = device; + buffer->host_allocator = host_allocator; buffer->handle = handle; *out_buffer = &buffer->base; } @@ -65,7 +63,7 @@ iree_status_t iree_hal_webgpu_buffer_wrap( static void iree_hal_webgpu_buffer_destroy(iree_hal_buffer_t* base_buffer) { iree_hal_webgpu_buffer_t* buffer = iree_hal_webgpu_buffer_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); if (buffer->is_mapped) { @@ -99,7 +97,7 @@ static iree_status_t iree_hal_webgpu_buffer_map_range( // Use wgpuBufferMapAsync directly to avoid this emulation. iree_hal_webgpu_buffer_t* buffer = iree_hal_webgpu_buffer_cast(base_buffer); return iree_hal_buffer_emulated_map_range( - buffer->device, base_buffer, mapping_mode, memory_access, + buffer->base.placement.device, base_buffer, mapping_mode, memory_access, local_byte_offset, local_byte_length, mapping); } @@ -109,8 +107,8 @@ static iree_status_t iree_hal_webgpu_buffer_unmap_range( // WebGPU does not allow for synchronous buffer mapping. // Use wgpuBufferMapAsync directly to avoid this emulation. 
iree_hal_webgpu_buffer_t* buffer = iree_hal_webgpu_buffer_cast(base_buffer); - return iree_hal_buffer_emulated_unmap_range(buffer->device, base_buffer, - local_byte_offset, + return iree_hal_buffer_emulated_unmap_range(buffer->base.placement.device, + base_buffer, local_byte_offset, local_byte_length, mapping); } diff --git a/experimental/webgpu/buffer.h b/experimental/webgpu/buffer.h index 056185d62d6b..c837753bd7a4 100644 --- a/experimental/webgpu/buffer.h +++ b/experimental/webgpu/buffer.h @@ -19,8 +19,8 @@ extern "C" { // we start to support pooling. iree_status_t iree_hal_webgpu_buffer_wrap( - iree_hal_device_t* device, iree_hal_allocator_t* device_allocator, - iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, WGPUBuffer handle, iree_allocator_t host_allocator, diff --git a/experimental/webgpu/simple_allocator.c b/experimental/webgpu/simple_allocator.c index d6016a1ca8f5..ac0cc072973f 100644 --- a/experimental/webgpu/simple_allocator.c +++ b/experimental/webgpu/simple_allocator.c @@ -195,9 +195,14 @@ static iree_status_t iree_hal_webgpu_simple_allocator_allocate_buffer( allocation_size); } + const iree_hal_buffer_placement_t placement = { + .device = allocator->device, + .queue_affinity = params->queue_affinity ? 
params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; iree_status_t status = iree_hal_webgpu_buffer_wrap( - allocator->device, base_allocator, params->type, params->access, - params->usage, allocation_size, + placement, params->type, params->access, params->usage, allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, buffer_handle, allocator->host_allocator, out_buffer); diff --git a/runtime/src/iree/hal/allocator.h b/runtime/src/iree/hal/allocator.h index de0ec30e8d78..7698b2e17113 100644 --- a/runtime/src/iree/hal/allocator.h +++ b/runtime/src/iree/hal/allocator.h @@ -495,6 +495,8 @@ IREE_HAL_ASSERT_VTABLE_LAYOUT(iree_hal_allocator_vtable_t); IREE_API_EXPORT void iree_hal_allocator_destroy( iree_hal_allocator_t* IREE_RESTRICT allocator); +// TODO(#19159): remove iree_hal_allocator_deallocate_buffer when pooling no +// longer requires the pooling_allocator on iree_hal_buffer_t. IREE_API_EXPORT void iree_hal_allocator_deallocate_buffer( iree_hal_allocator_t* IREE_RESTRICT allocator, iree_hal_buffer_t* IREE_RESTRICT buffer); diff --git a/runtime/src/iree/hal/allocator_heap.c b/runtime/src/iree/hal/allocator_heap.c index 2008de4c3d5e..e7ed9bbdb06f 100644 --- a/runtime/src/iree/hal/allocator_heap.c +++ b/runtime/src/iree/hal/allocator_heap.c @@ -198,8 +198,8 @@ static iree_status_t iree_hal_heap_allocator_allocate_buffer( IREE_STATISTICS(statistics = &allocator->statistics); iree_hal_buffer_t* buffer = NULL; IREE_RETURN_IF_ERROR(iree_hal_heap_buffer_create( - base_allocator, statistics, &compat_params, allocation_size, - allocator->data_allocator, allocator->host_allocator, &buffer)); + statistics, &compat_params, allocation_size, allocator->data_allocator, + allocator->host_allocator, &buffer)); *out_buffer = buffer; return iree_ok_status(); @@ -219,6 +219,9 @@ static iree_status_t iree_hal_heap_allocator_import_buffer( iree_hal_external_buffer_t* IREE_RESTRICT external_buffer, 
iree_hal_buffer_release_callback_t release_callback, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { + iree_hal_heap_allocator_t* allocator = + iree_hal_heap_allocator_cast(base_allocator); + // Coerce options into those required for use by heap-based devices. iree_hal_buffer_params_t compat_params = *params; iree_device_size_t allocation_size = external_buffer->size; @@ -243,11 +246,17 @@ static iree_status_t iree_hal_heap_allocator_import_buffer( "external buffer type not supported"); } + const iree_hal_buffer_placement_t placement = { + .device = NULL, + .queue_affinity = compat_params.queue_affinity + ? compat_params.queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; return iree_hal_heap_buffer_wrap( - base_allocator, compat_params.type, compat_params.access, - compat_params.usage, external_buffer->size, - iree_make_byte_span(ptr, external_buffer->size), release_callback, - out_buffer); + placement, compat_params.type, compat_params.access, compat_params.usage, + external_buffer->size, iree_make_byte_span(ptr, external_buffer->size), + release_callback, allocator->host_allocator, out_buffer); } static iree_status_t iree_hal_heap_allocator_export_buffer( diff --git a/runtime/src/iree/hal/buffer.c b/runtime/src/iree/hal/buffer.c index 6b3327618d2a..a55f2482d584 100644 --- a/runtime/src/iree/hal/buffer.c +++ b/runtime/src/iree/hal/buffer.c @@ -122,49 +122,34 @@ IREE_API_EXPORT iree_string_view_t iree_hal_buffer_usage_format( // Subspan indirection buffer //===----------------------------------------------------------------------===// -static const iree_hal_buffer_vtable_t iree_hal_subspan_buffer_vtable; - -IREE_API_EXPORT void iree_hal_subspan_buffer_initialize( - iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset, - iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator, - iree_allocator_t host_allocator, iree_hal_buffer_t* out_buffer) { - IREE_ASSERT_ARGUMENT(allocated_buffer); - 
IREE_ASSERT_ARGUMENT(out_buffer); - iree_hal_buffer_initialize(host_allocator, device_allocator, allocated_buffer, - allocated_buffer->allocation_size, byte_offset, - byte_length, allocated_buffer->memory_type, - allocated_buffer->allowed_access, - allocated_buffer->allowed_usage, - &iree_hal_subspan_buffer_vtable, out_buffer); -} +typedef struct iree_hal_subspan_buffer_t { + iree_hal_buffer_t base; + iree_allocator_t host_allocator; +} iree_hal_subspan_buffer_t; -IREE_API_EXPORT void iree_hal_subspan_buffer_deinitialize( - iree_hal_buffer_t* buffer) { - IREE_ASSERT_ARGUMENT(buffer); - iree_hal_buffer_release(buffer->allocated_buffer); - buffer->allocated_buffer = NULL; -} +static const iree_hal_buffer_vtable_t iree_hal_subspan_buffer_vtable; IREE_API_EXPORT iree_status_t iree_hal_subspan_buffer_create( iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset, - iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator, - iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + iree_device_size_t byte_length, iree_allocator_t host_allocator, + iree_hal_buffer_t** out_buffer) { IREE_ASSERT_ARGUMENT(allocated_buffer); IREE_ASSERT_ARGUMENT(out_buffer); *out_buffer = NULL; IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_buffer_t* buffer = NULL; + iree_hal_subspan_buffer_t* buffer = NULL; iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { iree_hal_buffer_initialize( - host_allocator, device_allocator, allocated_buffer, + iree_hal_buffer_placement_undefined(), allocated_buffer, allocated_buffer->allocation_size, byte_offset, byte_length, allocated_buffer->memory_type, allocated_buffer->allowed_access, allocated_buffer->allowed_usage, &iree_hal_subspan_buffer_vtable, - buffer); - *out_buffer = buffer; + &buffer->base); + buffer->host_allocator = host_allocator; + *out_buffer = &buffer->base; } IREE_TRACE_ZONE_END(z0); @@ -172,11 +157,12 @@ IREE_API_EXPORT iree_status_t 
iree_hal_subspan_buffer_create( } static void iree_hal_subspan_buffer_destroy(iree_hal_buffer_t* base_buffer) { - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_hal_subspan_buffer_t* buffer = (iree_hal_subspan_buffer_t*)base_buffer; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); iree_hal_buffer_release(base_buffer->allocated_buffer); - iree_allocator_free(host_allocator, base_buffer); + iree_allocator_free(host_allocator, buffer); IREE_TRACE_ZONE_END(z0); } @@ -221,162 +207,19 @@ static const iree_hal_buffer_vtable_t iree_hal_subspan_buffer_vtable = { .flush_range = iree_hal_subspan_buffer_flush_range, }; -//===----------------------------------------------------------------------===// -// iree_hal_deferred_buffer_t -//===----------------------------------------------------------------------===// - -typedef struct iree_hal_deferred_buffer_t { - iree_hal_buffer_t base; - iree_hal_queue_affinity_t queue_affinity; - iree_device_size_t min_alignment; -} iree_hal_deferred_buffer_t; - -static const iree_hal_buffer_vtable_t iree_hal_deferred_buffer_vtable; - -IREE_API_EXPORT iree_status_t iree_hal_deferred_buffer_create_reserved( - iree_hal_allocator_t* device_allocator, iree_device_size_t allocation_size, - iree_device_size_t byte_offset, iree_device_size_t byte_length, - iree_hal_buffer_params_t params, iree_allocator_t host_allocator, - iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(out_buffer); - *out_buffer = NULL; - IREE_TRACE_ZONE_BEGIN(z0); - - iree_hal_deferred_buffer_t* buffer = NULL; - IREE_RETURN_AND_END_ZONE_IF_ERROR( - z0, - iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer)); - iree_hal_buffer_initialize(host_allocator, device_allocator, NULL, - allocation_size, byte_offset, byte_length, - params.type, params.access, params.usage, - &iree_hal_deferred_buffer_vtable, &buffer->base); - buffer->queue_affinity = params.queue_affinity; - buffer->min_alignment = 
params.min_alignment; - *out_buffer = &buffer->base; - - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -static void iree_hal_deferred_buffer_destroy(iree_hal_buffer_t* base_buffer) { - iree_allocator_t host_allocator = base_buffer->host_allocator; - IREE_TRACE_ZONE_BEGIN(z0); - - if (base_buffer->allocated_buffer) { - iree_hal_buffer_release(base_buffer->allocated_buffer); - } - iree_allocator_free(host_allocator, base_buffer); - - IREE_TRACE_ZONE_END(z0); -} - -IREE_API_EXPORT iree_status_t -iree_hal_deferred_buffer_commit(iree_hal_buffer_t* base_buffer) { - iree_hal_deferred_buffer_t* buffer = (iree_hal_deferred_buffer_t*)base_buffer; - if (IREE_UNLIKELY(base_buffer->allocated_buffer)) { - // Already committed - no-op. - return iree_ok_status(); - } - IREE_TRACE_ZONE_BEGIN(z0); - iree_hal_buffer_params_t params = { - .usage = base_buffer->allowed_usage, - .access = base_buffer->allowed_access, - .type = base_buffer->memory_type, - .queue_affinity = buffer->queue_affinity, - .min_alignment = buffer->min_alignment, - }; - iree_status_t status = iree_hal_allocator_allocate_buffer( - base_buffer->device_allocator, params, base_buffer->allocation_size, - &base_buffer->allocated_buffer); - IREE_TRACE_ZONE_END(z0); - return status; -} - -IREE_API_EXPORT iree_status_t -iree_hal_deferred_buffer_decommit(iree_hal_buffer_t* buffer) { - IREE_TRACE_ZONE_BEGIN(z0); - if (IREE_LIKELY(buffer->allocated_buffer)) { - iree_hal_buffer_release(buffer->allocated_buffer); - buffer->allocated_buffer = NULL; - } - IREE_TRACE_ZONE_END(z0); - return iree_ok_status(); -} - -static iree_status_t iree_hal_deferred_buffer_map_range( - iree_hal_buffer_t* buffer, iree_hal_mapping_mode_t mapping_mode, - iree_hal_memory_access_t memory_access, - iree_device_size_t local_byte_offset, iree_device_size_t local_byte_length, - iree_hal_buffer_mapping_t* mapping) { - if (IREE_UNLIKELY(!buffer->allocated_buffer)) { - // Performance warning: this is likely to be happening synchronously in the - 
// caller in an unexpected way. We could FAILED_PRECONDITION if we wanted - // to be strict but by doing this on-demand we allow deferred buffers to be - // used with callers that may not know that this is a reserved deferred - // buffer (particularly useful for outputs/copy targets). - IREE_RETURN_IF_ERROR(iree_hal_deferred_buffer_commit(buffer)); - } - return _VTABLE_DISPATCH(buffer->allocated_buffer, map_range)( - buffer->allocated_buffer, mapping_mode, memory_access, local_byte_offset, - local_byte_length, mapping); -} - -static iree_status_t iree_hal_deferred_buffer_unmap_range( - iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length, iree_hal_buffer_mapping_t* mapping) { - if (IREE_UNLIKELY(!buffer->allocated_buffer)) { - return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have committed storage"); - } - return _VTABLE_DISPATCH(buffer->allocated_buffer, unmap_range)( - buffer->allocated_buffer, local_byte_offset, local_byte_length, mapping); -} - -static iree_status_t iree_hal_deferred_buffer_invalidate_range( - iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - if (IREE_UNLIKELY(!buffer->allocated_buffer)) { - return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have committed storage"); - } - return _VTABLE_DISPATCH(buffer->allocated_buffer, invalidate_range)( - buffer->allocated_buffer, local_byte_offset, local_byte_length); -} - -static iree_status_t iree_hal_deferred_buffer_flush_range( - iree_hal_buffer_t* buffer, iree_device_size_t local_byte_offset, - iree_device_size_t local_byte_length) { - if (IREE_UNLIKELY(!buffer->allocated_buffer)) { - return iree_make_status(IREE_STATUS_FAILED_PRECONDITION, - "buffer does not have committed storage"); - } - return _VTABLE_DISPATCH(buffer->allocated_buffer, flush_range)( - buffer->allocated_buffer, local_byte_offset, local_byte_length); -} - -static const 
iree_hal_buffer_vtable_t iree_hal_deferred_buffer_vtable = { - .recycle = iree_hal_buffer_recycle, - .destroy = iree_hal_deferred_buffer_destroy, - .map_range = iree_hal_deferred_buffer_map_range, - .unmap_range = iree_hal_deferred_buffer_unmap_range, - .invalidate_range = iree_hal_deferred_buffer_invalidate_range, - .flush_range = iree_hal_deferred_buffer_flush_range, -}; - //===----------------------------------------------------------------------===// // iree_hal_buffer_t //===----------------------------------------------------------------------===// IREE_API_EXPORT void iree_hal_buffer_initialize( - iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator, - iree_hal_buffer_t* allocated_buffer, iree_device_size_t allocation_size, - iree_device_size_t byte_offset, iree_device_size_t byte_length, - iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + iree_hal_buffer_placement_t placement, iree_hal_buffer_t* allocated_buffer, + iree_device_size_t allocation_size, iree_device_size_t byte_offset, + iree_device_size_t byte_length, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, const iree_hal_buffer_vtable_t* vtable, iree_hal_buffer_t* buffer) { iree_hal_resource_initialize(vtable, &buffer->resource); - buffer->host_allocator = host_allocator; - buffer->device_allocator = device_allocator; + buffer->placement = placement; buffer->allocated_buffer = allocated_buffer; buffer->allocation_size = allocation_size; buffer->byte_offset = byte_offset; @@ -395,8 +238,8 @@ IREE_API_EXPORT void iree_hal_buffer_initialize( IREE_API_EXPORT void iree_hal_buffer_recycle(iree_hal_buffer_t* buffer) { if (IREE_LIKELY(buffer)) { IREE_TRACE_ZONE_BEGIN(z0); - if (buffer->device_allocator) { - iree_hal_allocator_deallocate_buffer(buffer->device_allocator, buffer); + if (buffer->pooling_allocator) { + iree_hal_allocator_deallocate_buffer(buffer->pooling_allocator, buffer); } else 
{ iree_hal_buffer_destroy(buffer); } @@ -633,7 +476,8 @@ IREE_API_EXPORT iree_hal_buffer_overlap_t iree_hal_buffer_test_overlap( IREE_API_EXPORT iree_status_t iree_hal_buffer_subspan( iree_hal_buffer_t* buffer, iree_device_size_t byte_offset, - iree_device_size_t byte_length, iree_hal_buffer_t** out_buffer) { + iree_device_size_t byte_length, iree_allocator_t host_allocator, + iree_hal_buffer_t** out_buffer) { IREE_ASSERT_ARGUMENT(buffer); IREE_ASSERT_ARGUMENT(out_buffer); *out_buffer = NULL; @@ -657,12 +501,11 @@ IREE_API_EXPORT iree_status_t iree_hal_buffer_subspan( iree_hal_buffer_allocated_buffer(buffer); if (allocated_buffer && allocated_buffer != buffer) { return iree_hal_buffer_subspan(allocated_buffer, byte_offset, byte_length, - out_buffer); + host_allocator, out_buffer); } return iree_hal_subspan_buffer_create(buffer, byte_offset, byte_length, - /*device_allocator=*/NULL, - buffer->host_allocator, out_buffer); + host_allocator, out_buffer); } IREE_API_EXPORT iree_hal_buffer_t* iree_hal_buffer_allocated_buffer( @@ -677,6 +520,14 @@ iree_hal_buffer_allocation_size(const iree_hal_buffer_t* buffer) { return buffer->allocation_size; } +IREE_API_EXPORT iree_hal_buffer_placement_t +iree_hal_buffer_allocation_placement(const iree_hal_buffer_t* buffer) { + IREE_ASSERT_ARGUMENT(buffer); + return buffer == buffer->allocated_buffer + ? 
buffer->placement + : buffer->allocated_buffer->placement; +} + IREE_API_EXPORT iree_device_size_t iree_hal_buffer_byte_offset(const iree_hal_buffer_t* buffer) { IREE_ASSERT_ARGUMENT(buffer); diff --git a/runtime/src/iree/hal/buffer.h b/runtime/src/iree/hal/buffer.h index 82c1f478df22..502a68b35d2f 100644 --- a/runtime/src/iree/hal/buffer.h +++ b/runtime/src/iree/hal/buffer.h @@ -19,6 +19,7 @@ extern "C" { #endif // __cplusplus typedef struct iree_hal_allocator_t iree_hal_allocator_t; +typedef struct iree_hal_device_t iree_hal_device_t; //===----------------------------------------------------------------------===// // Types and Enums @@ -457,6 +458,65 @@ enum iree_hal_mapping_mode_bits_t { }; typedef uint32_t iree_hal_mapping_mode_t; +//===----------------------------------------------------------------------===// +// iree_hal_buffer_placement_t +//===----------------------------------------------------------------------===// + +// Flags describing the placement of a buffer on a device and its allocation +// semantics. This information is only valid on allocated buffers and not +// wrappers that may hold references to them. +typedef uint32_t iree_hal_buffer_placement_flags_t; +enum iree_hal_buffer_placement_flag_bits_t { + IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE = 0u, + // Buffer was allocated with an asynchronous allocation API such as + // iree_hal_device_queue_alloca and/or can be deallocated with an asynchronous + // deallocation API such as iree_hal_device_queue_dealloca. + IREE_HAL_BUFFER_PLACEMENT_FLAG_ASYNCHRONOUS = 1u << 0, + // TODO(benvanik): flags for discrete/external to allow for quick export + // checks. +}; + +// Describes the origin of an allocated buffer. +// This is used internally to route buffers back to pools and can be used by +// hosting layers to route deallocations to appropriate devices/queues. +// This information is generally only valid for allocated buffers (the result of +// an iree_hal_buffer_allocated_buffer query). 
+typedef struct iree_hal_buffer_placement_t { + // The device the buffer was allocated from. Unretained. + // Only valid for allocated buffers and not any intermediates (subspans, etc). + // May be NULL if the buffer is not associated with any particular device such + // as a free-floating heap-allocated buffer on the host. + iree_hal_device_t* device; + // Queues on the device to which the buffer is available. Depending on the + // device this may indicate which queues have exclusive access to the buffer + // or which queues have optimal access. This may be broader than the original + // request if the buffer is able to be accessed by other queues without + // penalty. Usage of the buffer for queue read/write or asynchronous + // deallocation via iree_hal_device_queue_dealloca is only legal with a queue + // affinity that is a subset of this affinity set. + iree_hal_queue_affinity_t queue_affinity; + // Describes the placement behavior of a buffer on a device and its allocation + // semantics. + iree_hal_buffer_placement_flags_t flags; + uint32_t reserved; +} iree_hal_buffer_placement_t; + +// Returns a placement indicating that the buffer has no direct device it is +// associated with. Commonly used for free-floating buffer handles such as heap +// wrapped or allocated buffers that come from outside of the HAL. +static inline iree_hal_buffer_placement_t iree_hal_buffer_placement_undefined( + void) { + iree_hal_buffer_placement_t placement = {0}; + return placement; +} + +// Returns true if the |placement| is undefined and the buffer has no direct +// device it is associated with. 
+static inline bool iree_hal_buffer_placement_is_undefined( + const iree_hal_buffer_placement_t placement) { + return placement.device == NULL; +} + //===----------------------------------------------------------------------===// // iree_hal_buffer_params_t //===----------------------------------------------------------------------===// @@ -699,7 +759,8 @@ IREE_API_EXPORT iree_hal_buffer_overlap_t iree_hal_buffer_test_overlap( // |out_buffer| must be released by the caller. IREE_API_EXPORT iree_status_t iree_hal_buffer_subspan( iree_hal_buffer_t* buffer, iree_device_size_t byte_offset, - iree_device_size_t byte_length, iree_hal_buffer_t** out_buffer); + iree_device_size_t byte_length, iree_allocator_t host_allocator, + iree_hal_buffer_t** out_buffer); // Retains the given |buffer| for the caller. IREE_API_EXPORT void iree_hal_buffer_retain(iree_hal_buffer_t* buffer); @@ -720,6 +781,20 @@ IREE_API_EXPORT iree_hal_buffer_t* iree_hal_buffer_allocated_buffer( IREE_API_EXPORT iree_device_size_t iree_hal_buffer_allocation_size(const iree_hal_buffer_t* buffer); +// Returns the original placement of the allocated buffer. +// The placement applies to the entire underlying allocated buffer and not the +// potential subspan of the |buffer| handle. Many buffer handles may be backed +// by the same allocation. It's possible for placements to change over the +// lifetime of a buffer as it is moved across devices but the origin will always +// accept actions on the buffer such as deallocation. +// +// Note that not all buffers have a placement: e.g. host buffers allocated as +// free-floating objects will have no device assigned. +// iree_hal_buffer_placement_is_undefined can be used to check for this case +// explicitly. +IREE_API_EXPORT iree_hal_buffer_placement_t +iree_hal_buffer_allocation_placement(const iree_hal_buffer_t* buffer); + // Returns the offset in bytes of the buffer within its allocated_buffer. 
IREE_API_EXPORT iree_device_size_t iree_hal_buffer_byte_offset(const iree_hal_buffer_t* buffer); @@ -916,62 +991,14 @@ IREE_API_EXPORT iree_status_t iree_hal_buffer_mapping_subspan( // iree_hal_subspan_buffer_t //===----------------------------------------------------------------------===// -// Initializes in-place a subspan buffer stored in |out_buffer|. -// The reference count of the buffer will be set to 1. -// -// This is intended to be used for provably on-stack transient subspans or -// buffer wrapping where ownership is controlled externally. If the lifetime of -// the subspan may extend beyond the lifetime of the |out_buffer| storage then -// iree_hal_subspan_buffer_create must be used instead. -// -// iree_hal_subspan_buffer_deinitialize must be used to deinitialize the buffer. -IREE_API_EXPORT void iree_hal_subspan_buffer_initialize( - iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset, - iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator, - iree_allocator_t host_allocator, iree_hal_buffer_t* out_buffer); - -// Deinitializes a subspan buffer that was initialized with -// iree_hal_subspan_buffer_initialize. -IREE_API_EXPORT void iree_hal_subspan_buffer_deinitialize( - iree_hal_buffer_t* buffer); - // Creates a buffer referencing a subspan of some base allocation. -// Optionally |device_allocator| can be provided if this subspan references -// managed buffers that need deallocation callbacks. +// The subspan handle is allocated from |host_allocator| and releases its +// reference to |allocated_buffer| when it is destroyed. 
IREE_API_EXPORT iree_status_t iree_hal_subspan_buffer_create( iree_hal_buffer_t* allocated_buffer, iree_device_size_t byte_offset, - iree_device_size_t byte_length, iree_hal_allocator_t* device_allocator, - iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); - -//===----------------------------------------------------------------------===// -// iree_hal_deferred_buffer_t -//===----------------------------------------------------------------------===// - -// Creates a buffer with the given properties that has no backing storage. -// The buffer can be passed around/retained/etc with just the reservation and -// committed/decommitted on demand. All usage of the buffer beyond metadata -// queries requires that it be committed. -// -// WARNING: commit/decommit are thread-compatible. Callers must ensure that no -// threads try to use the buffer contents before a commit has completed and that -// no threads still have access to the buffer contents prior to a decommit. -IREE_API_EXPORT iree_status_t iree_hal_deferred_buffer_create_reserved( - iree_hal_allocator_t* device_allocator, iree_device_size_t allocation_size, - iree_device_size_t byte_offset, iree_device_size_t byte_length, - iree_hal_buffer_params_t params, iree_allocator_t host_allocator, + iree_device_size_t byte_length, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); -// Commits the backing storage of the |buffer| from its device allocator. -// Ignored if the buffer is already committed. -IREE_API_EXPORT iree_status_t -iree_hal_deferred_buffer_commit(iree_hal_buffer_t* buffer); - -// Decommits the backing storage of the |buffer| and returns it to a -// metadata-only state. No other threads must still have access to the buffer -// contents. 
-IREE_API_EXPORT iree_status_t -iree_hal_deferred_buffer_decommit(iree_hal_buffer_t* buffer); - //===----------------------------------------------------------------------===// // iree_hal_heap_buffer_t //===----------------------------------------------------------------------===// @@ -985,11 +1012,11 @@ iree_hal_deferred_buffer_decommit(iree_hal_buffer_t* buffer); // |out_buffer| must be released by the caller. |data| must be kept live for the // lifetime of the wrapping buffer. iree_status_t iree_hal_heap_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_byte_span_t data, iree_hal_buffer_release_callback_t release_callback, - iree_hal_buffer_t** out_buffer); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); //===----------------------------------------------------------------------===// // iree_hal_buffer_t implementation details @@ -1024,41 +1051,81 @@ static_assert(offsetof(iree_hal_buffer_vtable_t, recycle) == 0, "iree_hal_resource_vtable_t expects destroy at offset 0, we want " "to recycle instead"); +// NOTE: this shared data structure may be a mistake. If vtables were free we +// would not provide this and rely on each buffer implementation to implement +// all of the accessor methods. Indirection through vtables costs, though, so +// we hoist the common information that every buffer implementation needs here. +// Since this adds a fixed cost to every buffer on every implementation we +// should keep the structure as small as reasonable. +// +// NOTE: the internals of this structure are an implementation detail and may +// change at any time. If there's no API accessor for a field then assume it +// should not be used except by HAL buffer implementations. 
struct iree_hal_buffer_t { - // Frequently accessed: iree_hal_resource_t resource; // must be at 0 + // Underlying buffer allocation. If this points back at this buffer structure + // then the buffer is an allocated buffer itself and otherwise the underlying + // allocation is referenced and retained. iree_hal_buffer_t* allocated_buffer; + // Total size of the buffer allocation in its underlying storage. + // This is captured on each buffer including non-allocated buffers so that + // internal pooling/suballocation costs can be represented. iree_device_size_t allocation_size; + // Offset into the underlying allocated buffer this buffer range starts at. iree_device_size_t byte_offset; + // Length of the buffer range in the underlying allocated buffer. This is the + // logical length exposed to users. iree_device_size_t byte_length; - // Rarely accessed: - iree_allocator_t host_allocator; - iree_hal_allocator_t* device_allocator; + // Placement of the buffer on a device/queue set. Captured only for allocated + // buffers. + iree_hal_buffer_placement_t placement; + + // Hacky back reference to an allocator that should be notified when the + // buffer is released. This is a hack to support interception of buffers by + // pooling layers and is slated for removal. + // + // TODO(#19159): remove iree_hal_allocator_deallocate_buffer when pooling no + // longer requires the pooling_allocator on iree_hal_buffer_t. + iree_hal_allocator_t* pooling_allocator; + // TODO(benvanik): bit pack these; could be ~4 bytes vs 12. iree_hal_memory_type_t memory_type; iree_hal_buffer_usage_t allowed_usage; iree_hal_memory_access_t allowed_access; - // Implementation-defined flags. - uint16_t flags; + // Unused padding that more flags or identifiers can be placed in, such as + // which implementation pool owns the buffer. + uint16_t reserved; + + // Implementation-defined flags used for additional bookkeeping or routing + // by the buffer implementation. 
+ uint32_t flags; }; IREE_API_EXPORT void iree_hal_buffer_initialize( - iree_allocator_t host_allocator, iree_hal_allocator_t* device_allocator, - iree_hal_buffer_t* allocated_buffer, iree_device_size_t allocation_size, - iree_device_size_t byte_offset, iree_device_size_t byte_length, - iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + iree_hal_buffer_placement_t placement, iree_hal_buffer_t* allocated_buffer, + iree_device_size_t allocation_size, iree_device_size_t byte_offset, + iree_device_size_t byte_length, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, const iree_hal_buffer_vtable_t* vtable, iree_hal_buffer_t* buffer); -// Recycles |buffer| by returning it to its allocator (or destroying it). +// TODO(#19159): remove iree_hal_allocator_deallocate_buffer when pooling no +// longer requires the pooling_allocator on iree_hal_buffer_t. When buffers can +// use their normal destroy callback to return themselves to pools then we won't +// need this extra recycle thunk. +// +// Recycles |buffer| by releasing it to the origin it is associated with via the +// release callback (or destroying it, if none was specified). // The |buffer| pointer may remain valid if it is returned to a pool but callers -// must assume its contents are undefined. +// must assume its contents are undefined as if it had been freed. IREE_API_EXPORT void iree_hal_buffer_recycle(iree_hal_buffer_t* buffer); // Destroys |buffer| and frees its memory. -// Implementations should use iree_hal_buffer_recycle in their vtables. +// Implementations must use iree_hal_buffer_recycle in their vtables for the +// common iree_hal_resource_t destroy callback as this is only to be used by +// release callbacks that want to free the buffer. 
IREE_API_EXPORT void iree_hal_buffer_destroy(iree_hal_buffer_t* buffer); #ifdef __cplusplus diff --git a/runtime/src/iree/hal/buffer_heap.c b/runtime/src/iree/hal/buffer_heap.c index e7b763e9d9ed..93b38270c6e9 100644 --- a/runtime/src/iree/hal/buffer_heap.c +++ b/runtime/src/iree/hal/buffer_heap.c @@ -34,7 +34,14 @@ typedef struct iree_hal_heap_buffer_t { // base.flags has the iree_hal_heap_buffer_storage_mode_t. iree_hal_buffer_t base; + // Host allocator this buffer metadata structure was allocated from. May be + // different than the data allocator used for the buffer payload. + iree_allocator_t host_allocator; + + // TODO(benvanik): change to a raw pointer as the base.allocation_size is the + // same as the data_length. iree_byte_span_t data; + union { // Used for IREE_HAL_HEAP_BUFFER_STORAGE_MODE_SPLIT. iree_allocator_t data_allocator; @@ -45,9 +52,6 @@ typedef struct iree_hal_heap_buffer_t { // Optional statistics shared with the allocator. IREE_STATISTICS(iree_hal_heap_allocator_statistics_t* statistics;) } iree_hal_heap_buffer_t; -static_assert(sizeof(iree_hal_heap_buffer_t) <= 128, - "header should be <= the minimum buffer alignment so that we " - "don't introduce internal waste"); static const iree_hal_buffer_vtable_t iree_hal_heap_buffer_vtable; @@ -111,12 +115,10 @@ static iree_status_t iree_hal_heap_buffer_allocate_slab( } iree_status_t iree_hal_heap_buffer_create( - iree_hal_allocator_t* allocator, iree_hal_heap_allocator_statistics_t* statistics, const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size, iree_allocator_t data_allocator, iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(allocator); IREE_ASSERT_ARGUMENT(params); IREE_ASSERT_ARGUMENT(out_buffer); IREE_TRACE_ZONE_BEGIN(z0); @@ -137,10 +139,11 @@ iree_status_t iree_hal_heap_buffer_create( host_allocator, &buffer, &data); if (iree_status_is_ok(status)) { - iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, - 
allocation_size, 0, allocation_size, - params->type, params->access, params->usage, - &iree_hal_heap_buffer_vtable, &buffer->base); + iree_hal_buffer_initialize( + iree_hal_buffer_placement_undefined(), &buffer->base, allocation_size, + 0, allocation_size, params->type, params->access, params->usage, + &iree_hal_heap_buffer_vtable, &buffer->base); + buffer->host_allocator = host_allocator; buffer->data = data; if (same_allocator) { @@ -169,12 +172,11 @@ iree_status_t iree_hal_heap_buffer_create( } iree_status_t iree_hal_heap_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_byte_span_t data, iree_hal_buffer_release_callback_t release_callback, - iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(allocator); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { IREE_ASSERT_ARGUMENT(out_buffer); IREE_TRACE_ZONE_BEGIN(z0); @@ -188,16 +190,15 @@ iree_status_t iree_hal_heap_buffer_wrap( (int)IREE_HAL_HEAP_BUFFER_ALIGNMENT, data.data); } - iree_allocator_t host_allocator = - iree_hal_allocator_host_allocator(allocator); iree_hal_heap_buffer_t* buffer = NULL; iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { - iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, - allocation_size, 0, data.data_length, - memory_type, allowed_access, allowed_usage, - &iree_hal_heap_buffer_vtable, &buffer->base); + iree_hal_buffer_initialize(placement, &buffer->base, allocation_size, 0, + data.data_length, memory_type, allowed_access, + allowed_usage, &iree_hal_heap_buffer_vtable, + &buffer->base); + buffer->host_allocator = host_allocator; buffer->data = data; // Notify the provided callback when the external data is no longer needed. 
@@ -213,7 +214,7 @@ iree_status_t iree_hal_heap_buffer_wrap( static void iree_hal_heap_buffer_destroy(iree_hal_buffer_t* base_buffer) { iree_hal_heap_buffer_t* buffer = (iree_hal_heap_buffer_t*)base_buffer; - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); IREE_STATISTICS({ diff --git a/runtime/src/iree/hal/buffer_heap_impl.h b/runtime/src/iree/hal/buffer_heap_impl.h index 6068c28549c4..f358db87facb 100644 --- a/runtime/src/iree/hal/buffer_heap_impl.h +++ b/runtime/src/iree/hal/buffer_heap_impl.h @@ -31,7 +31,6 @@ typedef struct iree_hal_heap_allocator_statistics_t { // |data_allocator| and |host_allocator| are the same the buffer will be created // as a flat slab. |out_buffer| must be released by the caller. iree_status_t iree_hal_heap_buffer_create( - iree_hal_allocator_t* allocator, iree_hal_heap_allocator_statistics_t* statistics, const iree_hal_buffer_params_t* params, iree_device_size_t allocation_size, iree_allocator_t data_allocator, iree_allocator_t host_allocator, diff --git a/runtime/src/iree/hal/cts/buffer_mapping_test.h b/runtime/src/iree/hal/cts/buffer_mapping_test.h index 504f3d5e9c40..b78c8a84fa96 100644 --- a/runtime/src/iree/hal/cts/buffer_mapping_test.h +++ b/runtime/src/iree/hal/cts/buffer_mapping_test.h @@ -140,8 +140,9 @@ TEST_F(BufferMappingTest, ZeroSubspan) { // Create a subspan. iree_device_size_t subspan_length = 8; iree_hal_buffer_t* buffer_subspan = NULL; - IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, - subspan_length, &buffer_subspan)); + IREE_ASSERT_OK( + iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, subspan_length, + iree_allocator_system(), &buffer_subspan)); // Zero part of the subspan. IREE_ASSERT_OK(iree_hal_buffer_map_zero(buffer_subspan, /*byte_offset=*/4, @@ -253,8 +254,9 @@ TEST_F(BufferMappingTest, FillSubspan) { // Create a subspan. 
iree_device_size_t subspan_length = 8; iree_hal_buffer_t* buffer_subspan = NULL; - IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, - subspan_length, &buffer_subspan)); + IREE_ASSERT_OK( + iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, subspan_length, + iree_allocator_system(), &buffer_subspan)); // Fill part of the subspan. uint8_t fill_value = 0xFF; @@ -342,8 +344,9 @@ TEST_F(BufferMappingTest, ReadDataSubspan) { // Create a subspan. iree_device_size_t subspan_length = 8; iree_hal_buffer_t* buffer_subspan = NULL; - IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, - subspan_length, &buffer_subspan)); + IREE_ASSERT_OK( + iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, subspan_length, + iree_allocator_system(), &buffer_subspan)); // Read the entire buffer subspan. std::vector<uint8_t> actual_data(subspan_length); @@ -426,8 +429,9 @@ TEST_F(BufferMappingTest, WriteDataSubspan) { // Create a subspan. iree_device_size_t subspan_length = 8; iree_hal_buffer_t* buffer_subspan = NULL; - IREE_ASSERT_OK(iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, - subspan_length, &buffer_subspan)); + IREE_ASSERT_OK( + iree_hal_buffer_subspan(buffer, /*byte_offset=*/4, subspan_length, + iree_allocator_system(), &buffer_subspan)); // Write over part of the subspan. std::vector<uint8_t> fill_buffer{0x11, 0x22, 0x33, 0x44}; diff --git a/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h b/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h index 0ba96e478b99..861ff5711b01 100644 --- a/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h +++ b/runtime/src/iree/hal/cts/command_buffer_update_buffer_test.h @@ -116,8 +116,9 @@ TEST_F(CommandBufferUpdateBufferTest, UpdateBufferSubspan) { // Create a subspan.
iree_device_size_t subspan_length = 8; iree_hal_buffer_t* buffer_subspan; - IREE_ASSERT_OK(iree_hal_buffer_subspan(device_buffer, /*byte_offset=*/4, - subspan_length, &buffer_subspan)); + IREE_ASSERT_OK( + iree_hal_buffer_subspan(device_buffer, /*byte_offset=*/4, subspan_length, + iree_allocator_system(), &buffer_subspan)); iree_hal_command_buffer_t* command_buffer = NULL; IREE_CHECK_OK(iree_hal_command_buffer_create( diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c index 72332dbd4879..fde4abe178e1 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.c @@ -22,6 +22,9 @@ typedef struct iree_hal_cuda_allocator_t { // must be at offset 0. iree_hal_resource_t resource; + // Parent device that this allocator is associated with. Unowned. + iree_hal_device_t* parent_device; + // The device that this allocator allocates memory from. CUdevice device; @@ -55,9 +58,11 @@ static iree_hal_cuda_allocator_t* iree_hal_cuda_allocator_cast( } iree_status_t iree_hal_cuda_allocator_create( + iree_hal_device_t* parent_device, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(parent_device); IREE_ASSERT_ARGUMENT(cuda_symbols); IREE_ASSERT_ARGUMENT(out_allocator); IREE_TRACE_ZONE_BEGIN(z0); @@ -104,6 +109,7 @@ iree_status_t iree_hal_cuda_allocator_create( iree_hal_resource_initialize(&iree_hal_cuda_allocator_vtable, &allocator->resource); + allocator->parent_device = parent_device; allocator->device = device; allocator->stream = stream; allocator->pools = pools; @@ -419,8 +425,14 @@ static iree_status_t iree_hal_cuda_allocator_allocate_buffer( iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { + const iree_hal_buffer_placement_t placement = { + .device = 
allocator->parent_device, + .queue_affinity = params->queue_affinity ? params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; status = iree_hal_cuda_buffer_wrap( - base_allocator, compat_params.type, compat_params.access, + placement, compat_params.type, compat_params.access, compat_params.usage, allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, @@ -584,8 +596,14 @@ static iree_status_t iree_hal_cuda_allocator_import_buffer( iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { + const iree_hal_buffer_placement_t placement = { + .device = allocator->parent_device, + .queue_affinity = params->queue_affinity ? params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; status = iree_hal_cuda_buffer_wrap( - base_allocator, compat_params.type, compat_params.access, + placement, compat_params.type, compat_params.access, compat_params.usage, external_buffer->size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, buffer_type, device_ptr, host_ptr, release_callback, diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h index 3b4dbb3886d2..845ef3e3411c 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h +++ b/runtime/src/iree/hal/drivers/cuda/cuda_allocator.h @@ -22,6 +22,7 @@ extern "C" { // and the pointer must remain valid for the lifetime of the allocator. Pools // may not be supported on all devices and can be NULL. 
iree_status_t iree_hal_cuda_allocator_create( + iree_hal_device_t* parent_device, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, CUdevice device, CUstream stream, iree_hal_cuda_memory_pools_t* pools, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator); diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c index 8f73aa12ea0d..ab33c5f74e9f 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.c @@ -14,6 +14,7 @@ typedef struct iree_hal_cuda_buffer_t { iree_hal_buffer_t base; + iree_allocator_t host_allocator; iree_hal_cuda_buffer_type_t type; void* host_ptr; CUdeviceptr device_ptr; @@ -35,7 +36,7 @@ static const iree_hal_cuda_buffer_t* iree_hal_cuda_buffer_const_cast( } iree_status_t iree_hal_cuda_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, @@ -56,10 +57,11 @@ iree_status_t iree_hal_cuda_buffer_wrap( iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { - iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, - allocation_size, byte_offset, byte_length, - memory_type, allowed_access, allowed_usage, + iree_hal_buffer_initialize(placement, &buffer->base, allocation_size, + byte_offset, byte_length, memory_type, + allowed_access, allowed_usage, &iree_hal_cuda_buffer_vtable, &buffer->base); + buffer->host_allocator = host_allocator; buffer->type = buffer_type; buffer->host_ptr = host_ptr; buffer->device_ptr = device_ptr; @@ -73,7 +75,7 @@ iree_status_t iree_hal_cuda_buffer_wrap( static void iree_hal_cuda_buffer_destroy(iree_hal_buffer_t* base_buffer) { 
iree_hal_cuda_buffer_t* buffer = iree_hal_cuda_buffer_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); if (buffer->release_callback.fn) { buffer->release_callback.fn(buffer->release_callback.user_data, diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h index 3c677ae7d27d..16a8ae8becae 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h +++ b/runtime/src/iree/hal/drivers/cuda/cuda_buffer.h @@ -34,7 +34,7 @@ typedef enum iree_hal_cuda_buffer_type_e { // Wraps a CUDA allocation in an iree_hal_buffer_t. iree_status_t iree_hal_cuda_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, diff --git a/runtime/src/iree/hal/drivers/cuda/cuda_device.c b/runtime/src/iree/hal/drivers/cuda/cuda_device.c index d5ed44816056..9c9c7c61e02c 100644 --- a/runtime/src/iree/hal/drivers/cuda/cuda_device.c +++ b/runtime/src/iree/hal/drivers/cuda/cuda_device.c @@ -496,13 +496,13 @@ static iree_status_t iree_hal_cuda_device_create_internal( // Create memory pools first so that we can share them with the allocator. 
if (iree_status_is_ok(status) && device->supports_memory_pools) { status = iree_hal_cuda_memory_pools_initialize( - cuda_symbols, cu_device, &params->memory_pools, host_allocator, - &device->memory_pools); + (iree_hal_device_t*)device, cuda_symbols, cu_device, + &params->memory_pools, host_allocator, &device->memory_pools); } if (iree_status_is_ok(status)) { status = iree_hal_cuda_allocator_create( - cuda_symbols, cu_device, dispatch_stream, + (iree_hal_device_t*)device, cuda_symbols, cu_device, dispatch_stream, device->supports_memory_pools ? &device->memory_pools : NULL, host_allocator, &device->device_allocator); } diff --git a/runtime/src/iree/hal/drivers/cuda/memory_pools.c b/runtime/src/iree/hal/drivers/cuda/memory_pools.c index 1e34422478f5..ac53271283b0 100644 --- a/runtime/src/iree/hal/drivers/cuda/memory_pools.c +++ b/runtime/src/iree/hal/drivers/cuda/memory_pools.c @@ -58,16 +58,19 @@ static iree_status_t iree_hal_cuda_create_memory_pool( } iree_status_t iree_hal_cuda_memory_pools_initialize( + iree_hal_device_t* parent_device, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, const iree_hal_cuda_memory_pooling_params_t* pooling_params, iree_allocator_t host_allocator, iree_hal_cuda_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(parent_device); IREE_ASSERT_ARGUMENT(cuda_symbols); IREE_ASSERT_ARGUMENT(pooling_params); IREE_ASSERT_ARGUMENT(out_pools); IREE_TRACE_ZONE_BEGIN(z0); memset(out_pools, 0, sizeof(*out_pools)); + out_pools->parent_device = parent_device; out_pools->cuda_symbols = cuda_symbols; out_pools->host_allocator = host_allocator; @@ -241,13 +244,19 @@ iree_status_t iree_hal_cuda_memory_pools_alloca( // doesn't dealloca the buffer. iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { + const iree_hal_buffer_placement_t placement = { + .device = pools->parent_device, + .queue_affinity = params.queue_affinity ?
params.queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_ASYNCHRONOUS, + }; iree_hal_buffer_release_callback_t release_callback = { .fn = iree_hal_cuda_async_buffer_release_callback, .user_data = pools, }; status = iree_hal_cuda_buffer_wrap( - /*device_allocator=*/NULL, params.type, params.access, params.usage, - allocation_size, /*byte_offset=*/0, + placement, params.type, params.access, params.usage, allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, IREE_HAL_CUDA_BUFFER_TYPE_ASYNC, device_ptr, /*host_ptr=*/NULL, release_callback, pools->host_allocator, &buffer); diff --git a/runtime/src/iree/hal/drivers/cuda/memory_pools.h b/runtime/src/iree/hal/drivers/cuda/memory_pools.h index 98917dc04431..99e4d271447f 100644 --- a/runtime/src/iree/hal/drivers/cuda/memory_pools.h +++ b/runtime/src/iree/hal/drivers/cuda/memory_pools.h @@ -25,6 +25,7 @@ typedef struct iree_hal_cuda_memory_pools_t { // Used for any host-visible/host-local memory types. CUmemoryPool other; + iree_hal_device_t* parent_device; const iree_hal_cuda_dynamic_symbols_t* cuda_symbols; iree_allocator_t host_allocator; @@ -38,6 +39,7 @@ typedef struct iree_hal_cuda_memory_pools_t { // Initializes |out_pools| by configuring new CUDA memory pools. iree_status_t iree_hal_cuda_memory_pools_initialize( + iree_hal_device_t* parent_device, const iree_hal_cuda_dynamic_symbols_t* cuda_symbols, CUdevice cu_device, const iree_hal_cuda_memory_pooling_params_t* pooling_params, iree_allocator_t host_allocator, diff --git a/runtime/src/iree/hal/drivers/hip/hip_allocator.c b/runtime/src/iree/hal/drivers/hip/hip_allocator.c index 95a04cd1812b..c8bc93f52aa4 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_allocator.c +++ b/runtime/src/iree/hal/drivers/hip/hip_allocator.c @@ -24,6 +24,9 @@ typedef struct iree_hal_hip_allocator_t { // must be at offset 0. iree_hal_resource_t resource; + // Parent device that this allocator is associated with. Unowned. 
+ iree_hal_device_t* parent_device; + // The device that this allocator allocates memory from. hipDevice_t device; @@ -56,10 +59,12 @@ static iree_hal_hip_allocator_t* iree_hal_hip_allocator_cast( } iree_status_t iree_hal_hip_allocator_create( + iree_hal_device_t* parent_device, const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t device, hipCtx_t hip_context, hipStream_t stream, iree_hal_hip_memory_pools_t* pools, iree_allocator_t host_allocator, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(parent_device); IREE_ASSERT_ARGUMENT(hip_symbols); IREE_ASSERT_ARGUMENT(out_allocator); IREE_TRACE_ZONE_BEGIN(z0); @@ -93,6 +98,7 @@ iree_status_t iree_hal_hip_allocator_create( (void**)&allocator)); iree_hal_resource_initialize(&iree_hal_hip_allocator_vtable, &allocator->resource); + allocator->parent_device = parent_device; allocator->device = device; allocator->stream = stream; allocator->pools = pools; @@ -408,8 +414,14 @@ static iree_status_t iree_hal_hip_allocator_allocate_buffer( iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { + const iree_hal_buffer_placement_t placement = { + .device = allocator->parent_device, + .queue_affinity = params->queue_affinity ? params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; status = iree_hal_hip_buffer_wrap( - base_allocator, compat_params.type, compat_params.access, + placement, compat_params.type, compat_params.access, compat_params.usage, allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, buffer_type, device_ptr, host_ptr, @@ -556,8 +568,14 @@ static iree_status_t iree_hal_hip_allocator_import_buffer( iree_hal_buffer_t* buffer = NULL; if (iree_status_is_ok(status)) { + const iree_hal_buffer_placement_t placement = { + .device = allocator->parent_device, + .queue_affinity = params->queue_affinity ? 
params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; status = iree_hal_hip_buffer_wrap( - base_allocator, compat_params.type, compat_params.access, + placement, compat_params.type, compat_params.access, compat_params.usage, external_buffer->size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, buffer_type, device_ptr, diff --git a/runtime/src/iree/hal/drivers/hip/hip_allocator.h b/runtime/src/iree/hal/drivers/hip/hip_allocator.h index 89ed0936be0c..5c19a7a957df 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_allocator.h +++ b/runtime/src/iree/hal/drivers/hip/hip_allocator.h @@ -21,6 +21,7 @@ extern "C" { // |pools| provides memory pools that may be shared across multiple allocators // and the pointer must remain valid for the lifetime of the allocator. iree_status_t iree_hal_hip_allocator_create( + iree_hal_device_t* parent_device, const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t device, hipCtx_t hip_context, hipStream_t stream, iree_hal_hip_memory_pools_t* pools, iree_allocator_t host_allocator, diff --git a/runtime/src/iree/hal/drivers/hip/hip_buffer.c b/runtime/src/iree/hal/drivers/hip/hip_buffer.c index 46e768ace25f..a0efa9a60d8f 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_buffer.c +++ b/runtime/src/iree/hal/drivers/hip/hip_buffer.c @@ -16,6 +16,7 @@ typedef struct iree_hal_hip_buffer_t { iree_hal_buffer_t base; + iree_allocator_t host_allocator; iree_hal_hip_buffer_type_t type; void* host_ptr; hipDeviceptr_t device_ptr; @@ -40,7 +41,7 @@ static const iree_hal_hip_buffer_t* iree_hal_hip_buffer_const_cast( } iree_status_t iree_hal_hip_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, @@ 
-61,10 +62,11 @@ iree_status_t iree_hal_hip_buffer_wrap( iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { - iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, - allocation_size, byte_offset, byte_length, - memory_type, allowed_access, allowed_usage, + iree_hal_buffer_initialize(placement, &buffer->base, allocation_size, + byte_offset, byte_length, memory_type, + allowed_access, allowed_usage, &iree_hal_hip_buffer_vtable, &buffer->base); + buffer->host_allocator = host_allocator; buffer->type = buffer_type; buffer->host_ptr = host_ptr; buffer->device_ptr = device_ptr; @@ -101,7 +103,7 @@ void iree_hal_hip_buffer_set_allocation_empty(iree_hal_buffer_t* base_buffer) { static void iree_hal_hip_buffer_destroy(iree_hal_buffer_t* base_buffer) { iree_hal_hip_buffer_t* buffer = iree_hal_hip_buffer_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); if (buffer->release_callback.fn) { buffer->release_callback.fn(buffer->release_callback.user_data, diff --git a/runtime/src/iree/hal/drivers/hip/hip_buffer.h b/runtime/src/iree/hal/drivers/hip/hip_buffer.h index 4264b54ddb14..3a18956effde 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_buffer.h +++ b/runtime/src/iree/hal/drivers/hip/hip_buffer.h @@ -34,7 +34,7 @@ typedef enum iree_hal_hip_buffer_type_e { // Wraps a HIP allocation in an iree_hal_buffer_t. 
iree_status_t iree_hal_hip_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, diff --git a/runtime/src/iree/hal/drivers/hip/hip_device.c b/runtime/src/iree/hal/drivers/hip/hip_device.c index bc001d645c3a..c6fe92967cd6 100644 --- a/runtime/src/iree/hal/drivers/hip/hip_device.c +++ b/runtime/src/iree/hal/drivers/hip/hip_device.c @@ -539,13 +539,14 @@ static iree_status_t iree_hal_hip_device_create_internal( // Create memory pools first so that we can share them with the allocator. if (iree_status_is_ok(status) && device->supports_memory_pools) { status = iree_hal_hip_memory_pools_initialize( - symbols, hip_device, context, &params->memory_pools, host_allocator, - &device->memory_pools); + (iree_hal_device_t*)device, symbols, hip_device, context, + &params->memory_pools, host_allocator, &device->memory_pools); } if (iree_status_is_ok(status)) { status = iree_hal_hip_allocator_create( - symbols, hip_device, context, dispatch_stream, + (iree_hal_device_t*)device, symbols, hip_device, context, + dispatch_stream, device->supports_memory_pools ? &device->memory_pools : NULL, host_allocator, &device->device_allocator); } @@ -1001,10 +1002,16 @@ static iree_status_t iree_hal_hip_device_pepare_async_alloc( iree_hal_buffer_params_canonicalize(&params); + const iree_hal_buffer_placement_t placement = { + .device = (iree_hal_device_t*)device, + .queue_affinity = params.queue_affinity ?
params.queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_ASYNCHRONOUS, + }; iree_hal_buffer_t* buffer = NULL; iree_status_t status = iree_hal_hip_buffer_wrap( - device->device_allocator, params.type, params.access, params.usage, - allocation_size, /*byte_offset=*/0, + placement, params.type, params.access, params.usage, allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, IREE_HAL_HIP_BUFFER_TYPE_ASYNC, /*device_ptr=*/NULL, /*host_ptr=*/NULL, iree_hal_buffer_release_callback_null(), device->host_allocator, &buffer); diff --git a/runtime/src/iree/hal/drivers/hip/memory_pools.c b/runtime/src/iree/hal/drivers/hip/memory_pools.c index 93a046b1f59e..c5a927fc0bbd 100644 --- a/runtime/src/iree/hal/drivers/hip/memory_pools.c +++ b/runtime/src/iree/hal/drivers/hip/memory_pools.c @@ -59,11 +59,13 @@ static iree_status_t iree_hal_hip_create_memory_pool( } iree_status_t iree_hal_hip_memory_pools_initialize( + iree_hal_device_t* parent_device, const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t hip_device, hipCtx_t hip_context, const iree_hal_hip_memory_pooling_params_t* pooling_params, iree_allocator_t host_allocator, iree_hal_hip_memory_pools_t* IREE_RESTRICT out_pools) { + IREE_ASSERT_ARGUMENT(parent_device); IREE_ASSERT_ARGUMENT(hip_symbols); IREE_ASSERT_ARGUMENT(pooling_params); IREE_ASSERT_ARGUMENT(out_pools); @@ -72,6 +74,7 @@ iree_status_t iree_hal_hip_memory_pools_initialize( z0, iree_hal_hip_set_context(hip_symbols, hip_context)); memset(out_pools, 0, sizeof(*out_pools)); + out_pools->parent_device = parent_device; out_pools->hip_symbols = hip_symbols; out_pools->host_allocator = host_allocator; out_pools->hip_context = hip_context; @@ -267,14 +270,20 @@ iree_status_t iree_hal_hip_memory_pools_prepare_buffer( // NOTE: we don't provide a device allocator because we didn't allocate from // one and instead we use a release callback to perform the free if the user // doesn't dealloca the buffer. 
- iree_hal_buffer_t* buffer = NULL; + const iree_hal_buffer_placement_t placement = { + .device = pools->parent_device, + .queue_affinity = params.queue_affinity ? params.queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_ASYNCHRONOUS, + }; iree_hal_buffer_release_callback_t release_callback = { .fn = iree_hal_hip_async_buffer_release_callback, .user_data = pools, }; + iree_hal_buffer_t* buffer = NULL; iree_status_t status = iree_hal_hip_buffer_wrap( - /*device_allocator=*/NULL, params.type, params.access, params.usage, - allocation_size, /*byte_offset=*/0, + placement, params.type, params.access, params.usage, allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, IREE_HAL_HIP_BUFFER_TYPE_ASYNC, /*device_ptr*/ NULL, /*host_ptr=*/NULL, release_callback, pools->host_allocator, &buffer); diff --git a/runtime/src/iree/hal/drivers/hip/memory_pools.h b/runtime/src/iree/hal/drivers/hip/memory_pools.h index f95b76d9816b..7d66090e33f6 100644 --- a/runtime/src/iree/hal/drivers/hip/memory_pools.h +++ b/runtime/src/iree/hal/drivers/hip/memory_pools.h @@ -32,6 +32,7 @@ typedef struct iree_hal_hip_memory_pools_t { // Used for any host-visible/host-local memory types. hipMemPool_t other; + iree_hal_device_t* parent_device; const iree_hal_hip_dynamic_symbols_t* hip_symbols; hipCtx_t hip_context; iree_allocator_t host_allocator; @@ -46,6 +47,7 @@ typedef struct iree_hal_hip_memory_pools_t { // Initializes |out_pools| by configuring new HIP memory pools. 
iree_status_t iree_hal_hip_memory_pools_initialize( + iree_hal_device_t* parent_device, const iree_hal_hip_dynamic_symbols_t* hip_symbols, hipDevice_t hip_device, hipCtx_t hip_context, const iree_hal_hip_memory_pooling_params_t* pooling_params, diff --git a/runtime/src/iree/hal/drivers/metal/direct_allocator.h b/runtime/src/iree/hal/drivers/metal/direct_allocator.h index bbd460916ce8..6b5b49122032 100644 --- a/runtime/src/iree/hal/drivers/metal/direct_allocator.h +++ b/runtime/src/iree/hal/drivers/metal/direct_allocator.h @@ -26,7 +26,7 @@ extern "C" { // |out_allocator| must be released by the caller (see // iree_hal_allocator_release). iree_status_t iree_hal_metal_allocator_create( - id device, + iree_hal_device_t* parent_device, id device, #if defined(IREE_PLATFORM_MACOS) id queue, #endif // IREE_PLATFORM_MACOS diff --git a/runtime/src/iree/hal/drivers/metal/direct_allocator.m b/runtime/src/iree/hal/drivers/metal/direct_allocator.m index d97886e6f7c2..b9c6a40c4820 100644 --- a/runtime/src/iree/hal/drivers/metal/direct_allocator.m +++ b/runtime/src/iree/hal/drivers/metal/direct_allocator.m @@ -22,6 +22,9 @@ // Abstract resource used for injecting reference counting and vtable; must be at offset 0. iree_hal_resource_t resource; + // Parent device that this allocator is associated with. Unowned. + iree_hal_device_t* parent_device; + // The device that this allocator is attached to. id device; // The command queue that we can use to issue commands to make buffer contents visible to CPU. 
@@ -51,7 +54,7 @@ } iree_status_t iree_hal_metal_allocator_create( - id device, + iree_hal_device_t* parent_device, id device, #if defined(IREE_PLATFORM_MACOS) id queue, #endif // IREE_PLATFORM_MACOS @@ -273,14 +276,22 @@ static iree_status_t iree_hal_metal_allocator_allocate_buffer( IREE_TRACE_ZONE_END(z0); return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED, "unable to allocate buffer"); } + + const iree_hal_buffer_placement_t placement = { + .device = allocator->parent_device, + .queue_affinity = + params->queue_affinity ? params->queue_affinity : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; iree_hal_buffer_t* buffer = NULL; iree_status_t status = iree_hal_metal_buffer_wrap( + placement, #if defined(IREE_PLATFORM_MACOS) allocator->queue, #endif // IREE_PLATFORM_MACOS - metal_buffer, base_allocator, compat_params.type, compat_params.access, compat_params.usage, - allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, - iree_hal_buffer_release_callback_null(), &buffer); // +1 + metal_buffer, compat_params.type, compat_params.access, compat_params.usage, allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, iree_hal_buffer_release_callback_null(), + allocator->host_allocator, &buffer); // +1 if (iree_status_is_ok(status)) { IREE_TRACE_ALLOC_NAMED(IREE_HAL_METAL_ALLOCATOR_ID, (void*)iree_hal_metal_buffer_handle(buffer), @@ -336,13 +347,20 @@ static iree_status_t iree_hal_metal_allocator_import_host_buffer( return iree_make_status(IREE_STATUS_RESOURCE_EXHAUSTED, "unable to allocate buffer"); } - return iree_hal_metal_buffer_wrap( + const iree_hal_buffer_placement_t placement = { + .device = allocator->parent_device, + .queue_affinity = + params->queue_affinity ? 
params->queue_affinity : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; + return iree_hal_metal_buffer_wrap(placement, #if defined(IREE_PLATFORM_MACOS) - allocator->queue, + allocator->queue, #endif // IREE_PLATFORM_MACOS - metal_buffer, base_allocator, params->type, params->access, params->usage, - external_buffer->size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, - release_callback, out_buffer); // +1 + metal_buffer, params->type, params->access, params->usage, + external_buffer->size, + /*byte_offset=*/0, /*byte_length=*/external_buffer->size, + release_callback, allocator->host_allocator, out_buffer); // +1 } static iree_status_t iree_hal_metal_allocator_import_device_buffer( @@ -352,7 +370,6 @@ static iree_status_t iree_hal_metal_allocator_import_device_buffer( iree_hal_buffer_release_callback_t release_callback, iree_hal_buffer_t** IREE_RESTRICT out_buffer) { iree_hal_metal_allocator_t* allocator = iree_hal_metal_allocator_cast(base_allocator); - (void)allocator; // Device allocation is an unowned MTLBuffer; we need to retain it to keep it live. id metal_buffer = @@ -363,13 +380,20 @@ static iree_status_t iree_hal_metal_allocator_import_device_buffer( // Wrap the externally-provided buffer in a HAL buffer handle that will retain the MTLBuffer until // it has been released. - return iree_hal_metal_buffer_wrap( + const iree_hal_buffer_placement_t placement = { + .device = allocator->parent_device, + .queue_affinity = + params->queue_affinity ? 
params->queue_affinity : IREE_HAL_QUEUE_AFFINITY_ANY, + .flags = IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; + return iree_hal_metal_buffer_wrap(placement, #if defined(IREE_PLATFORM_MACOS) - allocator->queue, + allocator->queue, #endif // IREE_PLATFORM_MACOS - metal_buffer, base_allocator, params->type, params->access, params->usage, - external_buffer->size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, - release_callback, out_buffer); // +1 + metal_buffer, params->type, params->access, params->usage, + external_buffer->size, /*byte_offset=*/0, + /*byte_length=*/external_buffer->size, release_callback, + allocator->host_allocator, out_buffer); // +1 } static iree_status_t iree_hal_metal_allocator_import_buffer( diff --git a/runtime/src/iree/hal/drivers/metal/metal_buffer.h b/runtime/src/iree/hal/drivers/metal/metal_buffer.h index 1a30fd3a5779..0607dc72e00b 100644 --- a/runtime/src/iree/hal/drivers/metal/metal_buffer.h +++ b/runtime/src/iree/hal/drivers/metal/metal_buffer.h @@ -20,15 +20,16 @@ extern "C" { // // |out_buffer| must be released by the caller (see iree_hal_buffer_release). iree_status_t iree_hal_metal_buffer_wrap( + iree_hal_buffer_placement_t placement, #if defined(IREE_PLATFORM_MACOS) id queue, #endif // IREE_PLATFORM_MACOS - id metal_buffer, iree_hal_allocator_t* allocator, - iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, + id metal_buffer, iree_hal_memory_type_t memory_type, + iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, iree_hal_buffer_release_callback_t release_callback, - iree_hal_buffer_t** out_buffer); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); // Returns true if the buffer was wrapped from an external handle instead of // allocated by the HAL allocator. 
diff --git a/runtime/src/iree/hal/drivers/metal/metal_buffer.m b/runtime/src/iree/hal/drivers/metal/metal_buffer.m index 2f22c842d485..b92ee9ec1d48 100644 --- a/runtime/src/iree/hal/drivers/metal/metal_buffer.m +++ b/runtime/src/iree/hal/drivers/metal/metal_buffer.m @@ -17,6 +17,7 @@ typedef struct iree_hal_metal_buffer_t { iree_hal_buffer_t base; + iree_allocator_t host_allocator; id buffer; // The command queue that we can use to issue commands to make buffer contents visible to CPU. #if defined(IREE_PLATFORM_MACOS) @@ -39,25 +40,26 @@ } iree_status_t iree_hal_metal_buffer_wrap( + iree_hal_buffer_placement_t placement, #if defined(IREE_PLATFORM_MACOS) id queue, #endif // IREE_PLATFORM_MACOS - id metal_buffer, iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + id metal_buffer, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, iree_hal_buffer_release_callback_t release_callback, - iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(allocator); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(placement.device); IREE_ASSERT_ARGUMENT(out_buffer); IREE_TRACE_ZONE_BEGIN(z0); - iree_allocator_t host_allocator = iree_hal_allocator_host_allocator(allocator); iree_hal_metal_buffer_t* buffer = NULL; iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { - iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, allocation_size, - byte_offset, byte_length, memory_type, allowed_access, allowed_usage, + iree_hal_buffer_initialize(placement, &buffer->base, allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, &iree_hal_metal_buffer_vtable, &buffer->base); + buffer->host_allocator = host_allocator; buffer->buffer = [metal_buffer 
retain]; // +1 #if defined(IREE_PLATFORM_MACOS) buffer->queue = queue; @@ -72,7 +74,7 @@ iree_status_t iree_hal_metal_buffer_wrap( static void iree_hal_metal_buffer_destroy(iree_hal_buffer_t* base_buffer) { iree_hal_metal_buffer_t* buffer = iree_hal_metal_buffer_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)iree_hal_buffer_allocation_size(base_buffer)); diff --git a/runtime/src/iree/hal/drivers/metal/metal_device.m b/runtime/src/iree/hal/drivers/metal/metal_device.m index 4f8b4fd3e0bf..ef8e2c974465 100644 --- a/runtime/src/iree/hal/drivers/metal/metal_device.m +++ b/runtime/src/iree/hal/drivers/metal/metal_device.m @@ -122,7 +122,7 @@ static iree_status_t iree_hal_metal_device_create_internal( initWithDispatchQueue:device->semaphore_notification_queue]; // +1 device->capture_manager = NULL; - iree_status_t status = iree_hal_metal_allocator_create(metal_device, + iree_status_t status = iree_hal_metal_allocator_create((iree_hal_device_t*)device, metal_device, #if defined(IREE_PLATFORM_MACOS) metal_queue, #endif // IREE_PLATFORM_MACOS diff --git a/runtime/src/iree/hal/drivers/null/buffer.c b/runtime/src/iree/hal/drivers/null/buffer.c index 6e676526b1b5..fc52c02a2461 100644 --- a/runtime/src/iree/hal/drivers/null/buffer.c +++ b/runtime/src/iree/hal/drivers/null/buffer.c @@ -12,6 +12,7 @@ typedef struct iree_hal_null_buffer_t { iree_hal_buffer_t base; + iree_allocator_t host_allocator; iree_hal_buffer_release_callback_t release_callback; } iree_hal_null_buffer_t; @@ -30,7 +31,7 @@ static const iree_hal_null_buffer_t* iree_hal_null_buffer_const_cast( } iree_status_t iree_hal_null_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t 
allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, @@ -44,10 +45,11 @@ iree_status_t iree_hal_null_buffer_wrap( IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer)); - iree_hal_buffer_initialize(host_allocator, allocator, &buffer->base, - allocation_size, byte_offset, byte_length, - memory_type, allowed_access, allowed_usage, + iree_hal_buffer_initialize(placement, &buffer->base, allocation_size, + byte_offset, byte_length, memory_type, + allowed_access, allowed_usage, &iree_hal_null_buffer_vtable, &buffer->base); + buffer->host_allocator = host_allocator; buffer->release_callback = release_callback; // TODO(null): retain or take ownership of provided handles/pointers/etc. @@ -68,7 +70,7 @@ iree_status_t iree_hal_null_buffer_wrap( static void iree_hal_null_buffer_destroy(iree_hal_buffer_t* base_buffer) { iree_hal_null_buffer_t* buffer = iree_hal_null_buffer_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->host_allocator; IREE_TRACE_ZONE_BEGIN(z0); // Optionally call a release callback when the buffer is destroyed. Not all diff --git a/runtime/src/iree/hal/drivers/null/buffer.h b/runtime/src/iree/hal/drivers/null/buffer.h index edf2e457e5e8..4befac509006 100644 --- a/runtime/src/iree/hal/drivers/null/buffer.h +++ b/runtime/src/iree/hal/drivers/null/buffer.h @@ -16,7 +16,7 @@ // Wraps a {Null} allocation in an iree_hal_buffer_t. 
iree_status_t iree_hal_null_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, diff --git a/runtime/src/iree/hal/drivers/vulkan/base_buffer.h b/runtime/src/iree/hal/drivers/vulkan/base_buffer.h index ea872b564886..7f7d798b3a50 100644 --- a/runtime/src/iree/hal/drivers/vulkan/base_buffer.h +++ b/runtime/src/iree/hal/drivers/vulkan/base_buffer.h @@ -98,6 +98,7 @@ iree_status_t iree_hal_vulkan_query_memory_heaps( // to get access to the API VkBuffer handle. typedef struct iree_hal_vulkan_base_buffer_t { iree_hal_buffer_t base; + iree_allocator_t host_allocator; // NOTE: may be VK_NULL_HANDLE if sparse residency is used to back the buffer // with multiple device memory allocations. VkDeviceMemory device_memory; diff --git a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc index 2114c5b5e4af..47f15d8b17ab 100644 --- a/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc +++ b/runtime/src/iree/hal/drivers/vulkan/native_allocator.cc @@ -28,6 +28,8 @@ typedef struct iree_hal_vulkan_native_allocator_t { iree_hal_resource_t resource; VkDeviceHandle* logical_device; iree_allocator_t host_allocator; + // Parent device that this allocator is associated with. Unowned. + iree_hal_device_t* parent_device; // Cached from the API to avoid additional queries in hot paths. 
VkPhysicalDeviceProperties device_props; @@ -56,9 +58,11 @@ static void iree_hal_vulkan_native_allocator_destroy( iree_hal_allocator_t* IREE_RESTRICT base_allocator); extern "C" iree_status_t iree_hal_vulkan_native_allocator_create( - const iree_hal_vulkan_device_options_t* options, VkInstance instance, + const iree_hal_vulkan_device_options_t* options, + iree_hal_device_t* parent_device, VkInstance instance, VkPhysicalDevice physical_device, VkDeviceHandle* logical_device, iree_hal_allocator_t** out_allocator) { + IREE_ASSERT_ARGUMENT(parent_device); IREE_ASSERT_ARGUMENT(instance); IREE_ASSERT_ARGUMENT(physical_device); IREE_ASSERT_ARGUMENT(logical_device); @@ -74,6 +78,7 @@ extern "C" iree_status_t iree_hal_vulkan_native_allocator_create( &allocator->resource); allocator->logical_device = logical_device; allocator->host_allocator = host_allocator; + allocator->parent_device = parent_device; const auto& syms = logical_device->syms(); @@ -266,6 +271,13 @@ static iree_status_t iree_hal_vulkan_native_allocator_commit_and_wrap( VkQueue queue = VK_NULL_HANDLE; logical_device->syms()->vkGetDeviceQueue(*logical_device, 0, 0, &queue); + const iree_hal_buffer_placement_t placement = { + /*.device=*/allocator->parent_device, + /*.queue_affinity=*/params->queue_affinity ? params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + /*.flags=*/IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; + // Ask Vulkan what the implementation requires of the allocation(s) for the // buffer. We should in most cases always get the same kind of values but // alignment and valid memory types will differ for dense and sparse buffers. @@ -284,11 +296,12 @@ static iree_status_t iree_hal_vulkan_native_allocator_commit_and_wrap( // to allocate such buffers (synchronously from raw allocations) but this // path is primarily used by large persistent variables and constants. 
return iree_hal_vulkan_sparse_buffer_create_bound_sync( - (iree_hal_allocator_t*)allocator, params->type, params->access, - params->usage, allocation_size, /*byte_offset=*/0, + placement, params->type, params->access, params->usage, allocation_size, + /*byte_offset=*/0, /*byte_length=*/allocation_size, logical_device, queue, handle, requirements, memory_type_index, - allocator->device_props_11.maxMemoryAllocationSize, out_buffer); + allocator->device_props_11.maxMemoryAllocationSize, + allocator->host_allocator, out_buffer); } // Allocate the device memory we'll attach the buffer to. @@ -321,12 +334,11 @@ static iree_status_t iree_hal_vulkan_native_allocator_commit_and_wrap( iree_hal_vulkan_native_allocator_native_buffer_release; internal_release_callback.user_data = NULL; iree_status_t status = iree_hal_vulkan_native_buffer_wrap( - (iree_hal_allocator_t*)allocator, params->type, params->access, - params->usage, allocation_size, + placement, params->type, params->access, params->usage, allocation_size, /*byte_offset=*/0, /*byte_length=*/allocation_size, logical_device, device_memory, handle, internal_release_callback, iree_hal_buffer_release_callback_null(), - out_buffer); + allocator->host_allocator, out_buffer); if (!iree_status_is_ok(status)) { logical_device->syms()->vkFreeMemory(*logical_device, device_memory, logical_device->allocator()); @@ -722,6 +734,12 @@ static iree_status_t iree_hal_vulkan_native_allocator_import_host_buffer( } // Wrap the device memory allocation and buffer handle in our own buffer type. + const iree_hal_buffer_placement_t placement = { + /*.device=*/allocator->parent_device, + /*.queue_affinity=*/params->queue_affinity ? 
params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + /*.flags=*/IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; iree_hal_vulkan_native_buffer_release_callback_t internal_release_callback = { 0}; internal_release_callback.fn = @@ -729,11 +747,12 @@ static iree_status_t iree_hal_vulkan_native_allocator_import_host_buffer( internal_release_callback.user_data = NULL; iree_hal_buffer_t* buffer = NULL; status = iree_hal_vulkan_native_buffer_wrap( - (iree_hal_allocator_t*)allocator, params->type, params->access, - params->usage, (iree_device_size_t)allocation_size, + placement, params->type, params->access, params->usage, + (iree_device_size_t)allocation_size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, logical_device, device_memory, - handle, internal_release_callback, release_callback, &buffer); + handle, internal_release_callback, release_callback, + allocator->host_allocator, &buffer); if (!iree_status_is_ok(status)) { logical_device->syms()->vkDestroyBuffer(*logical_device, handle, logical_device->allocator()); @@ -809,17 +828,24 @@ static iree_status_t iree_hal_vulkan_native_allocator_import_device_buffer( } // Wrap the device memory allocation and buffer handle in our own buffer type. + const iree_hal_buffer_placement_t placement = { + /*.device=*/allocator->parent_device, + /*.queue_affinity=*/params->queue_affinity ? 
params->queue_affinity + : IREE_HAL_QUEUE_AFFINITY_ANY, + /*.flags=*/IREE_HAL_BUFFER_PLACEMENT_FLAG_NONE, + }; iree_hal_vulkan_native_buffer_release_callback_t internal_release_callback = { 0}; internal_release_callback.fn = iree_hal_vulkan_native_allocator_external_device_buffer_release; internal_release_callback.user_data = NULL; return iree_hal_vulkan_native_buffer_wrap( - (iree_hal_allocator_t*)allocator, params->type, params->access, - params->usage, (iree_device_size_t)external_buffer->size, + placement, params->type, params->access, params->usage, + (iree_device_size_t)external_buffer->size, /*byte_offset=*/0, /*byte_length=*/external_buffer->size, logical_device, device_memory, - handle, internal_release_callback, release_callback, out_buffer); + handle, internal_release_callback, release_callback, + allocator->host_allocator, out_buffer); } static iree_status_t iree_hal_vulkan_native_allocator_import_buffer( diff --git a/runtime/src/iree/hal/drivers/vulkan/native_allocator.h b/runtime/src/iree/hal/drivers/vulkan/native_allocator.h index 147f1b9ceb67..481b7c0e5f77 100644 --- a/runtime/src/iree/hal/drivers/vulkan/native_allocator.h +++ b/runtime/src/iree/hal/drivers/vulkan/native_allocator.h @@ -18,7 +18,8 @@ extern "C" { // Creates a native Vulkan API-based allocator that directly allocates memory // from the underlying implementation with no pooling or suballocation. 
iree_status_t iree_hal_vulkan_native_allocator_create( - const iree_hal_vulkan_device_options_t* options, VkInstance instance, + const iree_hal_vulkan_device_options_t* options, + iree_hal_device_t* parent_device, VkInstance instance, VkPhysicalDevice physical_device, iree::hal::vulkan::VkDeviceHandle* logical_device, iree_hal_allocator_t** out_allocator); diff --git a/runtime/src/iree/hal/drivers/vulkan/native_buffer.cc b/runtime/src/iree/hal/drivers/vulkan/native_buffer.cc index f0c0ec7b7b8a..6e7190098c7a 100644 --- a/runtime/src/iree/hal/drivers/vulkan/native_buffer.cc +++ b/runtime/src/iree/hal/drivers/vulkan/native_buffer.cc @@ -32,7 +32,7 @@ static iree_hal_vulkan_native_buffer_t* iree_hal_vulkan_native_buffer_cast( } iree_status_t iree_hal_vulkan_native_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, @@ -40,24 +40,23 @@ iree_status_t iree_hal_vulkan_native_buffer_wrap( VkDeviceMemory device_memory, VkBuffer handle, iree_hal_vulkan_native_buffer_release_callback_t internal_release_callback, iree_hal_buffer_release_callback_t user_release_callback, - iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(allocator); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(placement.device); IREE_ASSERT_ARGUMENT(logical_device); IREE_ASSERT_ARGUMENT(handle); IREE_ASSERT_ARGUMENT(out_buffer); IREE_TRACE_ZONE_BEGIN(z0); IREE_TRACE_ZONE_APPEND_VALUE_I64(z0, (int64_t)allocation_size); - iree_allocator_t host_allocator = - iree_hal_allocator_host_allocator(allocator); iree_hal_vulkan_native_buffer_t* buffer = NULL; iree_status_t status = iree_allocator_malloc(host_allocator, sizeof(*buffer), (void**)&buffer); if (iree_status_is_ok(status)) { 
iree_hal_buffer_initialize( - host_allocator, allocator, &buffer->base.base, allocation_size, - byte_offset, byte_length, memory_type, allowed_access, allowed_usage, + placement, &buffer->base.base, allocation_size, byte_offset, + byte_length, memory_type, allowed_access, allowed_usage, &iree_hal_vulkan_native_buffer_vtable, &buffer->base.base); + buffer->base.host_allocator = host_allocator; buffer->base.device_memory = device_memory; buffer->base.handle = handle; buffer->logical_device = logical_device; @@ -75,7 +74,7 @@ static void iree_hal_vulkan_native_buffer_destroy( iree_hal_buffer_t* base_buffer) { iree_hal_vulkan_native_buffer_t* buffer = iree_hal_vulkan_native_buffer_cast(base_buffer); - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->base.host_allocator; IREE_TRACE_ZONE_BEGIN(z0); IREE_TRACE_ZONE_APPEND_VALUE_I64( z0, (int64_t)iree_hal_buffer_allocation_size(base_buffer)); diff --git a/runtime/src/iree/hal/drivers/vulkan/native_buffer.h b/runtime/src/iree/hal/drivers/vulkan/native_buffer.h index 7d8a2738b43d..0049e2e9f958 100644 --- a/runtime/src/iree/hal/drivers/vulkan/native_buffer.h +++ b/runtime/src/iree/hal/drivers/vulkan/native_buffer.h @@ -35,7 +35,7 @@ typedef struct { // HAL. The provided callback is made when the buffer is destroyed to allow the // caller to clean up as appropriate. 
iree_status_t iree_hal_vulkan_native_buffer_wrap( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, @@ -43,7 +43,7 @@ iree_status_t iree_hal_vulkan_native_buffer_wrap( VkDeviceMemory device_memory, VkBuffer handle, iree_hal_vulkan_native_buffer_release_callback_t internal_release_callback, iree_hal_buffer_release_callback_t user_release_callback, - iree_hal_buffer_t** out_buffer); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); #ifdef __cplusplus } // extern "C" diff --git a/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.cc b/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.cc index 2ed3a8f2f9fb..b1f2778a65dc 100644 --- a/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.cc +++ b/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.cc @@ -123,15 +123,15 @@ static iree_status_t iree_hal_vulkan_sparse_buffer_commit_sync( } iree_status_t iree_hal_vulkan_sparse_buffer_create_bound_sync( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, VkBuffer handle, VkMemoryRequirements requirements, uint32_t memory_type_index, VkDeviceSize max_allocation_size, - iree_hal_buffer_t** out_buffer) { - IREE_ASSERT_ARGUMENT(allocator); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer) { + IREE_ASSERT_ARGUMENT(placement.device); IREE_ASSERT_ARGUMENT(logical_device); IREE_ASSERT_ARGUMENT(handle); IREE_ASSERT_ARGUMENT(out_buffer); @@ -151,8 
+151,6 @@ iree_status_t iree_hal_vulkan_sparse_buffer_create_bound_sync( (iree_host_size_t)iree_device_size_ceil_div(requirements.size, physical_block_size); - iree_allocator_t host_allocator = - iree_hal_allocator_host_allocator(allocator); iree_hal_vulkan_sparse_buffer_t* buffer = NULL; iree_host_size_t total_size = iree_host_align(sizeof(*buffer), iree_max_align_t) + @@ -160,9 +158,10 @@ iree_status_t iree_hal_vulkan_sparse_buffer_create_bound_sync( IREE_RETURN_AND_END_ZONE_IF_ERROR( z0, iree_allocator_malloc(host_allocator, total_size, (void**)&buffer)); iree_hal_buffer_initialize( - host_allocator, allocator, &buffer->base.base, allocation_size, - byte_offset, byte_length, memory_type, allowed_access, allowed_usage, + placement, &buffer->base.base, allocation_size, byte_offset, byte_length, + memory_type, allowed_access, allowed_usage, &iree_hal_vulkan_sparse_buffer_vtable, &buffer->base.base); + buffer->base.host_allocator = host_allocator; buffer->base.handle = handle; buffer->logical_device = logical_device; buffer->physical_block_count = physical_block_count; @@ -187,7 +186,7 @@ static void iree_hal_vulkan_sparse_buffer_destroy( iree_hal_vulkan_sparse_buffer_t* buffer = iree_hal_vulkan_sparse_buffer_cast(base_buffer); iree::hal::vulkan::VkDeviceHandle* logical_device = buffer->logical_device; - iree_allocator_t host_allocator = base_buffer->host_allocator; + iree_allocator_t host_allocator = buffer->base.host_allocator; IREE_TRACE_ZONE_BEGIN(z0); IREE_TRACE_ZONE_APPEND_VALUE_I64( z0, (int64_t)iree_hal_buffer_allocation_size(base_buffer)); diff --git a/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.h b/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.h index 9ab7d6557910..a74397c170e5 100644 --- a/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.h +++ b/runtime/src/iree/hal/drivers/vulkan/sparse_buffer.h @@ -26,14 +26,14 @@ extern "C" { // This will eventually be replaced with HAL device APIs for controlling the // reserve/commit/decommit/release 
behavior of the virtual/physical storage. iree_status_t iree_hal_vulkan_sparse_buffer_create_bound_sync( - iree_hal_allocator_t* allocator, iree_hal_memory_type_t memory_type, + iree_hal_buffer_placement_t placement, iree_hal_memory_type_t memory_type, iree_hal_memory_access_t allowed_access, iree_hal_buffer_usage_t allowed_usage, iree_device_size_t allocation_size, iree_device_size_t byte_offset, iree_device_size_t byte_length, iree::hal::vulkan::VkDeviceHandle* logical_device, VkQueue queue, VkBuffer handle, VkMemoryRequirements requirements, uint32_t memory_type_index, VkDeviceSize max_allocation_size, - iree_hal_buffer_t** out_buffer); + iree_allocator_t host_allocator, iree_hal_buffer_t** out_buffer); #ifdef __cplusplus } // extern "C" diff --git a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc index 6db27bc47c61..043394292188 100644 --- a/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc +++ b/runtime/src/iree/hal/drivers/vulkan/vulkan_device.cc @@ -755,8 +755,8 @@ static iree_status_t iree_hal_vulkan_device_create_internal( // Create the device memory allocator that will service all buffer // allocation requests. iree_status_t status = iree_hal_vulkan_native_allocator_create( - options, instance, physical_device, logical_device, - &device->device_allocator); + options, (iree_hal_device_t*)device, instance, physical_device, + logical_device, &device->device_allocator); // Create command pools for each queue family. If we don't have a transfer // queue then we'll ignore that one and just use the dispatch pool. 
diff --git a/runtime/src/iree/hal/utils/caching_allocator.c b/runtime/src/iree/hal/utils/caching_allocator.c index 599f416949d8..8b4771c1b05d 100644 --- a/runtime/src/iree/hal/utils/caching_allocator.c +++ b/runtime/src/iree/hal/utils/caching_allocator.c @@ -730,7 +730,12 @@ static iree_status_t iree_hal_caching_allocator_allocate_buffer( pool, &compat_params, allocation_size, out_buffer)); // Point the buffer back to us for deallocation. - (*out_buffer)->device_allocator = base_allocator; + // + // TODO(#19159): remove iree_hal_allocator_deallocate_buffer when pooling no + // longer requires the pooling_allocator on iree_hal_buffer_t. We should + // instead be creating a new iree_hal_cached_buffer_t that we return as if it + // were an allocated buffer and that can store a reference back to the pool. + (*out_buffer)->pooling_allocator = base_allocator; return iree_ok_status(); } @@ -751,6 +756,9 @@ static void iree_hal_caching_allocator_deallocate_buffer( IREE_ASSERT(pool, "pool to return cached buffer to not found"); if (!pool) return; + // TODO(#19159): remove iree_hal_allocator_deallocate_buffer when pooling no + // longer requires the pooling_allocator on iree_hal_buffer_t. + // Release back to pool (which may deallocate). 
iree_hal_caching_allocator_pool_release(pool, buffer); } diff --git a/runtime/src/iree/modules/hal/inline/module.c b/runtime/src/iree/modules/hal/inline/module.c index c3c5deae5213..a8924535e319 100644 --- a/runtime/src/iree/modules/hal/inline/module.c +++ b/runtime/src/iree/modules/hal/inline/module.c @@ -366,7 +366,7 @@ IREE_VM_ABI_EXPORT(iree_hal_inline_module_buffer_subspan, // iree_hal_buffer_t* subspan_buffer = NULL; IREE_RETURN_IF_ERROR( iree_hal_buffer_subspan(source_buffer, source_offset, length, - &subspan_buffer), + state->host_allocator, &subspan_buffer), "invalid subspan of an existing buffer (source_offset=%" PRIdsz ", length=%" PRIdsz ")", source_offset, length); @@ -424,7 +424,7 @@ IREE_VM_ABI_EXPORT(iree_hal_inline_module_buffer_view_create, // source_length != iree_hal_buffer_byte_length(source_buffer)) { IREE_RETURN_IF_ERROR( iree_hal_buffer_subspan(source_buffer, source_offset, source_length, - &subspan_buffer), + state->host_allocator, &subspan_buffer), "invalid subspan of an existing buffer (source_offset=%" PRIdsz ", length=%" PRIdsz ")", source_offset, source_length); diff --git a/runtime/src/iree/modules/hal/module.c b/runtime/src/iree/modules/hal/module.c index 3baa27dd91e4..24a11dbb0997 100644 --- a/runtime/src/iree/modules/hal/module.c +++ b/runtime/src/iree/modules/hal/module.c @@ -462,7 +462,7 @@ IREE_VM_ABI_EXPORT(iree_hal_module_buffer_subspan, // iree_hal_buffer_t* subspan_buffer = NULL; IREE_RETURN_IF_ERROR( iree_hal_buffer_subspan(source_buffer, source_offset, length, - &subspan_buffer), + state->host_allocator, &subspan_buffer), "invalid subspan of an existing buffer (source_offset=%" PRIdsz ", length=%" PRIdsz ")", source_offset, length); @@ -549,7 +549,7 @@ IREE_VM_ABI_EXPORT(iree_hal_module_buffer_view_create, // source_length != iree_hal_buffer_byte_length(source_buffer)) { IREE_RETURN_IF_ERROR( iree_hal_buffer_subspan(source_buffer, source_offset, source_length, - &subspan_buffer), + state->host_allocator, &subspan_buffer), 
"invalid subspan of an existing buffer (source_offset=%" PRIdsz ", length=%" PRIdsz ")", source_offset, source_length); diff --git a/runtime/src/iree/tooling/function_io.c b/runtime/src/iree/tooling/function_io.c index d416a0dfd7ce..c38a4cf414ab 100644 --- a/runtime/src/iree/tooling/function_io.c +++ b/runtime/src/iree/tooling/function_io.c @@ -906,18 +906,18 @@ static iree_status_t iree_tooling_create_buffer_view_with_vm_buffer( vm_buffer, 0, iree_vm_buffer_length(vm_buffer), 1, &span)); // Wrap the heap memory in a HAL buffer for read-only access. - iree_hal_buffer_release_callback_t release_callback = { + const iree_hal_buffer_release_callback_t release_callback = { .fn = iree_hal_buffer_release_vm_buffer, .user_data = vm_buffer, }; iree_vm_buffer_retain(vm_buffer); iree_hal_buffer_t* hal_buffer = NULL; iree_status_t status = iree_hal_heap_buffer_wrap( - device_allocator, IREE_HAL_MEMORY_TYPE_HOST_LOCAL, + iree_hal_buffer_placement_undefined(), IREE_HAL_MEMORY_TYPE_HOST_LOCAL, IREE_HAL_MEMORY_ACCESS_READ, IREE_HAL_BUFFER_USAGE_TRANSFER_SOURCE | IREE_HAL_BUFFER_USAGE_MAPPING, span.data_length, iree_cast_const_byte_span(span), release_callback, - &hal_buffer); + host_allocator, &hal_buffer); iree_vm_buffer_release(vm_buffer); // Wrap the HAL buffer in a buffer view. 
@@ -931,15 +931,14 @@ static iree_status_t iree_tooling_create_buffer_view_with_vm_buffer( } static iree_status_t iree_tooling_create_buffer_view_empty( - iree_hal_allocator_t* device_allocator, iree_allocator_t host_allocator, - iree_hal_buffer_view_t** out_buffer_view) { + iree_allocator_t host_allocator, iree_hal_buffer_view_t** out_buffer_view) { iree_hal_buffer_t* hal_buffer = NULL; IREE_RETURN_IF_ERROR(iree_hal_heap_buffer_wrap( - device_allocator, IREE_HAL_MEMORY_TYPE_HOST_LOCAL, + iree_hal_buffer_placement_undefined(), IREE_HAL_MEMORY_TYPE_HOST_LOCAL, IREE_HAL_MEMORY_ACCESS_READ, IREE_HAL_BUFFER_USAGE_TRANSFER_SOURCE | IREE_HAL_BUFFER_USAGE_MAPPING, 0, iree_byte_span_empty(), iree_hal_buffer_release_callback_null(), - &hal_buffer)); + host_allocator, &hal_buffer)); iree_status_t status = iree_tooling_create_buffer_view_with_hal_buffer( hal_buffer, host_allocator, out_buffer_view); iree_hal_buffer_release(hal_buffer); @@ -953,8 +952,8 @@ static iree_status_t iree_tooling_create_buffer_view_with_value( iree_hal_element_type_t element_type = IREE_HAL_ELEMENT_TYPE_NONE; switch (value.type) { case IREE_VM_VALUE_TYPE_NONE: - return iree_tooling_create_buffer_view_empty( - device_allocator, host_allocator, out_buffer_view); + return iree_tooling_create_buffer_view_empty(host_allocator, + out_buffer_view); case IREE_VM_VALUE_TYPE_I8: byte_length = sizeof(value.i8); element_type = IREE_HAL_ELEMENT_TYPE_INT_8; @@ -1015,8 +1014,8 @@ static iree_status_t iree_tooling_create_buffer_view_from_variant( if (iree_vm_variant_is_empty(variant)) { // Empty value - we need to emit a zero-length value to keep the npy file // ordered when there are multiple entries. 
- return iree_tooling_create_buffer_view_empty( - device_allocator, host_allocator, out_buffer_view); + return iree_tooling_create_buffer_view_empty(host_allocator, + out_buffer_view); } else if (iree_vm_variant_is_ref(variant)) { if (iree_hal_buffer_view_isa(variant.ref)) { // Buffer view returned can provide the metadata required.