From 3baaa6e9c03cf50ecaa5a6b6cbdccd2a2b9e0051 Mon Sep 17 00:00:00 2001 From: Saleel Kudchadker Date: Tue, 20 Aug 2024 04:44:46 +0000 Subject: [PATCH] rocr: Allocate AQL queue on device memory - Use HSA_ALLOCATE_QUEUE_DEV_MEM=1 to create AQL queue in device memory. - Before writing AQL packet header to the queue use an SFENCE to ensure that there is no reordering of the writes over PCIE Change-Id: I5eacdc35108c4a1e245c75ae349b7495451aa60d --- .../core/driver/kfd/amd_kfd_driver.cpp | 5 ++++ runtime/hsa-runtime/core/inc/amd_gpu_agent.h | 25 +++++++++++------ runtime/hsa-runtime/core/inc/memory_region.h | 21 +++++++------- runtime/hsa-runtime/core/inc/queue.h | 8 ++++++ .../core/runtime/amd_aql_queue.cpp | 28 +++++++++++++++---- .../core/runtime/amd_blit_kernel.cpp | 4 +++ .../core/runtime/amd_gpu_agent.cpp | 10 +++++-- .../hsa-runtime/core/runtime/amd_topology.cpp | 14 ++-------- .../core/runtime/intercept_queue.cpp | 13 ++++++++- runtime/hsa-runtime/core/util/flag.h | 7 ++++- 10 files changed, 95 insertions(+), 40 deletions(-) diff --git a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp index 894c93343..0de256664 100644 --- a/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp +++ b/runtime/hsa-runtime/core/driver/kfd/amd_kfd_driver.cpp @@ -117,6 +117,11 @@ KfdDriver::AllocateMemory(const core::MemoryRegion &mem_region, ? 1 : kmt_alloc_flags.ui32.GTTAccess); + kmt_alloc_flags.ui32.Uncached = + (alloc_flags & core::MemoryRegion::AllocateUncached + ? 1 + : kmt_alloc_flags.ui32.Uncached); + if (m_region.IsLocalMemory()) { // Allocate physically contiguous memory. AllocateKfdMemory function call // will fail if this flag is not supported in KFD. 
diff --git a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h index 4b7c93ec2..7f2f96517 100644 --- a/runtime/hsa-runtime/core/inc/amd_gpu_agent.h +++ b/runtime/hsa-runtime/core/inc/amd_gpu_agent.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// +// // Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. -// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -348,7 +348,7 @@ class GpuAgent : public GpuAgentInt { } core::Agent* GetNearestCpuAgent(void) const; - + void RegisterGangPeer(core::Agent& gang_peer, unsigned int bandwidth_factor) override; void RegisterRecSdmaEngIdMaskPeer(core::Agent& gang_peer, uint32_t rec_sdma_eng_id_mask) override; @@ -417,6 +417,9 @@ class GpuAgent : public GpuAgentInt { if (t0_.GPUClockCounter == t1_.GPUClockCounter) SyncClocks(); } + // @brief Override from AMD::GpuAgentInt. + __forceinline bool is_xgmi_cpu_gpu() const { return xgmi_cpu_gpu_; } + const size_t MAX_SCRATCH_APERTURE_PER_XCC = (1ULL << 32); size_t MaxScratchDevice() const { return properties_.NumXcc * MAX_SCRATCH_APERTURE_PER_XCC; } @@ -624,6 +627,7 @@ class GpuAgent : public GpuAgentInt { // @brief HDP flush registers hsa_amd_hdp_flush_t HDP_flush_ = {nullptr, nullptr}; + private: // @brief Query the driver to get the region list owned by this agent. void InitRegionList(); @@ -782,6 +786,9 @@ class GpuAgent : public GpuAgentInt { std::map rec_sdma_eng_id_peers_info_; bool uses_rec_sdma_eng_id_mask_; + + // @brief XGMI CPU<->GPU + bool xgmi_cpu_gpu_; }; } // namespace amd diff --git a/runtime/hsa-runtime/core/inc/memory_region.h b/runtime/hsa-runtime/core/inc/memory_region.h index 66acf3636..94ebf6730 100644 --- a/runtime/hsa-runtime/core/inc/memory_region.h +++ b/runtime/hsa-runtime/core/inc/memory_region.h @@ -2,24 +2,24 @@ // // The University of Illinois/NCSA // Open Source License (NCSA) -// -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. -// +// +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. +// // Developed by: -// +// // AMD Research and AMD HSA Software Development -// +// // Advanced Micro Devices, Inc. 
-// +// // www.amd.com -// +// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to // deal with the Software without restriction, including without limitation // the rights to use, copy, modify, merge, publish, distribute, sublicense, // and/or sell copies of the Software, and to permit persons to whom the // Software is furnished to do so, subject to the following conditions: -// +// // - Redistributions of source code must retain the above copyright notice, // this list of conditions and the following disclaimers. // - Redistributions in binary form must reproduce the above copyright @@ -29,7 +29,7 @@ // nor the names of its contributors may be used to endorse or promote // products derived from this Software without specific prior written // permission. -// +// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL @@ -104,7 +104,8 @@ class MemoryRegion : public Checked<0x9C961F19EE175BB3> { // Note: The node_id needs to be the node_id of the device even though this is allocating // system memory AllocateGTTAccess = (1 << 9), - AllocateContiguous = (1 << 10), // Physically contiguous memory + AllocateContiguous = (1 << 10), // Physically contiguous memory + AllocateUncached = (1 << 11), // Uncached memory }; typedef uint32_t AllocateFlags; diff --git a/runtime/hsa-runtime/core/inc/queue.h b/runtime/hsa-runtime/core/inc/queue.h index 45b6fbe24..43bbe13e7 100644 --- a/runtime/hsa-runtime/core/inc/queue.h +++ b/runtime/hsa-runtime/core/inc/queue.h @@ -182,11 +182,13 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { Queue(int mem_flags = 0) : LocalQueue(mem_flags), amd_queue_(queue()->amd_queue) { queue()->core_queue = this; public_handle_ = Convert(this); + pcie_write_ordering_ = false; } Queue(int agent_node_id, int mem_flags) : LocalQueue(agent_node_id, mem_flags), amd_queue_(queue()->amd_queue) { queue()->core_queue = this; public_handle_ = Convert(this); + pcie_write_ordering_ = false; } virtual ~Queue() {} @@ -385,6 +387,10 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { bool IsType(rtti_t id) { return _IsA(id); } + bool needsPcieOrdering() const { return pcie_write_ordering_; } + + void setPcieOrdering(bool val) { pcie_write_ordering_ = val; } + protected: static void set_public_handle(Queue* ptr, hsa_queue_t* handle) { ptr->do_set_public_handle(handle); @@ -405,6 +411,8 @@ class Queue : public Checked<0xFA3906A679F9DB49>, private LocalQueue { // HSA Queue ID - used to bind a unique ID static std::atomic hsa_queue_counter_; + bool pcie_write_ordering_; + DISALLOW_COPY_AND_ASSIGN(Queue); }; } // namespace core diff --git a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp index f8bec1d84..0a55dc9aa 100644 --- 
a/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp @@ -3,7 +3,7 @@ // The University of Illinois/NCSA // Open Source License (NCSA) // -// Copyright (c) 2014-2020, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // @@ -344,6 +344,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr if (!core::Runtime::runtime_singleton_->flag().cu_mask_skip_init()) SetCUMasking(0, nullptr); active_ = true; + setPcieOrdering(agent->is_xgmi_cpu_gpu()); PM4IBGuard.Dismiss(); RingGuard.Dismiss(); @@ -727,10 +728,15 @@ void AqlQueue::AllocRegisteredRingBuffer(uint32_t queue_size_pkts) { ring_buf_alloc_bytes_ = queue_size_pkts * sizeof(core::AqlPacket); assert(IsMultipleOf(ring_buf_alloc_bytes_, 4096) && "Ring buffer sizes must be 4KiB aligned."); - ring_buf_ = agent_->system_allocator()( - ring_buf_alloc_bytes_, 0x1000, - core::MemoryRegion::AllocateExecutable | - (queue_full_workaround_ ? core::MemoryRegion::AllocateDoubleMap : 0)); + if (core::Runtime::runtime_singleton_->flag().dev_mem_queue()) { + ring_buf_ = agent_->finegrain_allocator()(ring_buf_alloc_bytes_, + core::MemoryRegion::AllocateUncached); + } else { + ring_buf_ = agent_->system_allocator()( + ring_buf_alloc_bytes_, 0x1000, + core::MemoryRegion::AllocateExecutable | + (queue_full_workaround_ ? 
core::MemoryRegion::AllocateDoubleMap : 0)); + } assert(ring_buf_ != NULL && "AQL queue memory allocation failure"); @@ -751,7 +757,13 @@ void AqlQueue::FreeRegisteredRingBuffer() { (void*)(uintptr_t(ring_buf_) + (ring_buf_alloc_bytes_ / 2))); #endif } else { - agent_->system_deallocator()(ring_buf_); + if (ring_buf_) { + if (core::Runtime::runtime_singleton_->flag().dev_mem_queue()) { + agent_->finegrain_deallocator()(ring_buf_); + } else { + agent_->system_deallocator()(ring_buf_); + } + } } ring_buf_ = NULL; @@ -1542,6 +1554,10 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope // Overwrite the AQL invalid header (first dword) last. // This prevents the slot from being read until it's fully written. memcpy(&queue_slot[1], &slot_data[1], slot_size_b - sizeof(uint32_t)); + if (core::Runtime::runtime_singleton_->flag().dev_mem_queue() && !agent_->is_xgmi_cpu_gpu()) { + // Ensure the packet body is written as header may get reordered when writing over PCIE + _mm_sfence(); + } atomic::Store(&queue_slot[0], slot_data[0], std::memory_order_release); // Submit the packet slot. 
diff --git a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp index 81b930cc4..51814d077 100644 --- a/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_blit_kernel.cpp @@ -1256,6 +1256,10 @@ void BlitKernel::PopulateQueue(uint64_t index, uint64_t code_handle, void* args, std::atomic_thread_fence(std::memory_order_acquire); queue_buffer[index & queue_bitmask_] = packet; std::atomic_thread_fence(std::memory_order_release); + if (core::Runtime::runtime_singleton_->flag().dev_mem_queue() && !queue_->needsPcieOrdering()) { + // Ensure the packet body is written as header may get reordered when writing over PCIE + _mm_sfence(); + } queue_buffer[index & queue_bitmask_].header = kDispatchPacketHeader; LogPrint(HSA_AMD_LOG_FLAG_BLIT_KERNEL_PKTS, diff --git a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp index ceb11e8d8..cf139b4ca 100644 --- a/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_gpu_agent.cpp @@ -118,7 +118,8 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna scratch_cache_( [this](void* base, size_t size, bool large) { ReleaseScratch(base, size, large); }), trap_handler_tma_region_(NULL), - pcs_hosttrap_data_() { + pcs_hosttrap_data_(), + xgmi_cpu_gpu_(false) { const bool is_apu_node = (properties_.NumCPUCores > 0); profile_ = (is_apu_node) ? HSA_PROFILE_FULL : HSA_PROFILE_BASE; @@ -219,6 +220,11 @@ GpuAgent::GpuAgent(HSAuint32 node, const HsaNodeProperties& node_props, bool xna wallclock_frequency_ = uint64_t(info.gpu_counter_freq) * 1000ull; #endif + auto& firstCpu = core::Runtime::runtime_singleton_->cpu_agents()[0]; + auto linkInfo = core::Runtime::runtime_singleton_->GetLinkInfo(firstCpu->node_id(), + node_id()); + xgmi_cpu_gpu_ = (linkInfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI); + // Populate region list. 
InitRegionList(); @@ -574,7 +580,7 @@ void GpuAgent::ReserveScratch() { size_t reserved_sz = core::Runtime::runtime_singleton_->flag().scratch_single_limit(); if (reserved_sz > MaxScratchDevice()) { - fprintf(stdout, "User specified scratch limit exceeds device limits (requested:%lu max:%lu)!\n", + fprintf(stdout, "User specified scratch limit exceeds device limits (requested:%lu max:%lu)!\n", reserved_sz, MaxScratchDevice()); reserved_sz = MaxScratchDevice(); } diff --git a/runtime/hsa-runtime/core/runtime/amd_topology.cpp b/runtime/hsa-runtime/core/runtime/amd_topology.cpp index bffcc03c9..ee385ff9f 100644 --- a/runtime/hsa-runtime/core/runtime/amd_topology.cpp +++ b/runtime/hsa-runtime/core/runtime/amd_topology.cpp @@ -380,17 +380,6 @@ void BuildTopology() { } const_cast(core::Runtime::runtime_singleton_->flag()).parse_masks(maxGpu, maxCu); - // Temporary work-around, disable SDMA ganging on non-APUs in non-SPX modes - // Check xGMI APU status - bool isXgmiApu = false; - auto& firstCpu = core::Runtime::runtime_singleton_->cpu_agents()[0]; - for (auto& peer_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { - auto linfo = core::Runtime::runtime_singleton_->GetLinkInfo(firstCpu->node_id(), - peer_gpu->node_id()); - isXgmiApu = linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI; - if (isXgmiApu) break; - } - // Register destination agents that can SDMA gang copy for source agents for (auto& src_gpu : core::Runtime::runtime_singleton_->gpu_agents()) { uint32_t src_id = src_gpu->node_id(); @@ -406,6 +395,9 @@ void BuildTopology() { // Weigth of 15 - Direct GPU link in single partition mode // Weight of 41 - Inter-socket GPU link in multi-partition mode if (linfo.info.link_type == HSA_AMD_LINK_INFO_TYPE_XGMI) { + // Temporary work-around, disable SDMA ganging on non-APUs in non-SPX modes + // Check xGMI APU status + const bool isXgmiApu = reinterpret_cast(src_gpu)->is_xgmi_cpu_gpu(); if (linfo.info.numa_distance == 13 || linfo.info.numa_distance == 41) 
gang_factor = isXgmiApu ? 2 : 1; else if (linfo.info.numa_distance == 15 && linfo.info.min_bandwidth) diff --git a/runtime/hsa-runtime/core/runtime/intercept_queue.cpp b/runtime/hsa-runtime/core/runtime/intercept_queue.cpp index 47598bb95..0d08db7c1 100644 --- a/runtime/hsa-runtime/core/runtime/intercept_queue.cpp +++ b/runtime/hsa-runtime/core/runtime/intercept_queue.cpp @@ -258,6 +258,10 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { // Submit barrier which will wake async queue processing. ring[barrier & mask].packet.body = {}; ring[barrier & mask].barrier_and.completion_signal = Signal::Convert(async_doorbell_); + if (Runtime::runtime_singleton_->flag().dev_mem_queue() && !needsPcieOrdering()) { + // Ensure the packet body is written as header may get reordered when writing over PCIE + _mm_sfence(); + } atomic::Store(&ring[barrier & mask].barrier_and.header, kBarrierHeader, std::memory_order_release); // Update the wrapped queue's doorbell so it knows there is a new packet in the queue. 
@@ -301,6 +305,10 @@ uint64_t InterceptQueue::Submit(const AqlPacket* packets, uint64_t count) { ++packets_index; } if (write_index != 0) { + if (Runtime::runtime_singleton_->flag().dev_mem_queue() && !needsPcieOrdering()) { + // Ensure the packet body is written as header may get reordered when writing over PCIE + _mm_sfence(); + } atomic::Store(&ring[write & mask].packet.header, packets[first_written_packet_index].packet.header, std::memory_order_release); HSA::hsa_signal_store_screlease(wrapped->amd_queue_.hsa_queue.doorbell_signal, @@ -366,7 +374,10 @@ void InterceptQueue::StoreRelaxed(hsa_signal_value_t value) { Cursor.pkt_index = i; auto& handler = interceptors[Cursor.interceptor_index]; handler.first(&ring[i & mask], 1, i, handler.second, PacketWriter); - + if (Runtime::runtime_singleton_->flag().dev_mem_queue() && !needsPcieOrdering()) { + // Ensure the packet body is written as header may get reordered when writing over PCIE + _mm_sfence(); + } // Invalidate consumed packet. atomic::Store(&ring[i & mask].packet.header, kInvalidHeader, std::memory_order_release); diff --git a/runtime/hsa-runtime/core/util/flag.h b/runtime/hsa-runtime/core/util/flag.h index 5028e78de..178d307fc 100644 --- a/runtime/hsa-runtime/core/util/flag.h +++ b/runtime/hsa-runtime/core/util/flag.h @@ -3,7 +3,7 @@ // The University of Illinois/NCSA // Open Source License (NCSA) // -// Copyright (c) 2014-2021, Advanced Micro Devices, Inc. All rights reserved. +// Copyright (c) 2014-2024, Advanced Micro Devices, Inc. All rights reserved. // // Developed by: // @@ -247,6 +247,9 @@ class Flag { // Will either rename to HSA_OVERRIDE_CPU_AFFINITY later or remove completely. var = os::GetEnvVar("HSA_OVERRIDE_CPU_AFFINITY_DEBUG"); override_cpu_affinity_ = (var == "0") ? false : true; + + var = os::GetEnvVar("HSA_ALLOCATE_QUEUE_DEV_MEM"); + dev_mem_queue_ = (var == "1") ? 
true : false; } void parse_masks(uint32_t maxGpu, uint32_t maxCU) { @@ -357,6 +360,7 @@ class Flag { size_t pc_sampling_max_device_buffer_size() const { return pc_sampling_max_device_buffer_size_; } + bool dev_mem_queue() const { return dev_mem_queue_; } private: bool check_flat_scratch_; bool enable_vm_fault_message_; @@ -385,6 +389,7 @@ class Flag { bool image_print_srd_; bool enable_mwaitx_; bool enable_ipc_mode_legacy_; + bool dev_mem_queue_; SDMA_OVERRIDE enable_sdma_; SDMA_OVERRIDE enable_peer_sdma_;