
Commit

SWDEV-453301 - Remove the option to write multiple packets in dispatchGenericAqlPacket

Dispatching multiple packets while ringing the doorbell only once is not supported by the lower layers.

Change-Id: I7665a2dcdd4ef9e47dadfe410180fed64c5a4ee0
iassiour committed Apr 5, 2024
1 parent 880f1f0 commit d7f352d
Showing 4 changed files with 37 additions and 61 deletions.
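The change makes dispatchGenericAqlPacket follow the one-packet-per-doorbell protocol unconditionally: reserve a single slot via the queue write index, copy the packet body, publish the 32-bit header+setup word last with release semantics, then ring the doorbell at the reserved index. A minimal sketch of that sequence against the public HSA queue API (an illustration under those assumptions, not the rocclr code; queue-full handling is simplified and logging is omitted):

#include <hsa/hsa.h>
#include <cstdint>

// Sketch: dispatch exactly one AQL packet and ring the doorbell once.
static void dispatch_one(hsa_queue_t* q, const hsa_kernel_dispatch_packet_t& pkt,
                         uint16_t header, uint16_t setup) {
  const uint32_t mask = q->size - 1;  // HSA queue sizes are powers of two

  // Reserve a single slot; the same index is the doorbell value below.
  uint64_t index = hsa_queue_add_write_index_screlease(q, 1);

  // Back off while the queue is full (our slot has not been consumed yet).
  while (index - hsa_queue_load_read_index_scacquire(q) >= q->size) {
  }

  auto* slot =
      &static_cast<hsa_kernel_dispatch_packet_t*>(q->base_address)[index & mask];
  *slot = pkt;

  // Publish header (low 16 bits) and setup (high 16 bits) last with release
  // semantics so the packet processor never observes a partially written packet.
  __atomic_store_n(reinterpret_cast<uint32_t*>(slot),
                   static_cast<uint32_t>(header) |
                       (static_cast<uint32_t>(setup) << 16),
                   __ATOMIC_RELEASE);

  // One packet, one doorbell ring, at the packet's own index.
  hsa_signal_store_screlease(q->doorbell_signal, index);
}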
8 changes: 0 additions & 8 deletions rocclr/device/rocm/roccounters.cpp
@@ -431,14 +431,6 @@ PerfCounter::PerfCounter(const Device& device, //!< A ROC device object
 
   // these block indices are valid for the SI (Gfx8) & Gfx9 devices
   switch (roc_device_.isa().versionMajor()) {
-    case (8):
-      gfxVersion_ = ROC_GFX8;
-      if (blockIndex < viBlockIdOrcaToRocr.size()) {
-        auto p = viBlockIdOrcaToRocr[blockIndex];
-        event_.block_name = std::get<0>(p);
-        event_.block_index = std::get<1>(p);
-      }
-      break;
     case (9):
       gfxVersion_ = ROC_GFX9;
       if (blockIndex < gfx9BlockIdOrcaToRocr.size()) {
1 change: 0 additions & 1 deletion rocclr/device/rocm/roccounters.hpp
@@ -36,7 +36,6 @@ class PerfCounter : public device::PerfCounter {
  public:
   enum {
     ROC_UNSUPPORTED = 0,
-    ROC_GFX8,
     ROC_GFX9,
     ROC_GFX10
   };
86 changes: 36 additions & 50 deletions rocclr/device/rocm/rocvirtual.cpp
@@ -832,13 +832,13 @@ static inline void packet_store_release(uint32_t* packet, uint16_t header, uint16_t rest) {
 // ================================================================================================
 template <typename AqlPacket>
 bool VirtualGPU::dispatchGenericAqlPacket(
-    AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking, size_t size) {
+    AqlPacket* packet, uint16_t header, uint16_t rest, bool blocking) {
   const uint32_t queueSize = gpu_queue_->size;
   const uint32_t queueMask = queueSize - 1;
   const uint32_t sw_queue_size = queueMask;
 
   // Check for queue full and wait if needed.
-  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, size);
+  uint64_t index = hsa_queue_add_write_index_screlease(gpu_queue_, 1);
   uint64_t read = hsa_queue_load_read_index_relaxed(gpu_queue_);
   if (addSystemScope_) {
     header &= ~(HSA_FENCE_SCOPE_AGENT << HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE |
@@ -887,43 +887,38 @@ bool VirtualGPU::dispatchGenericAqlPacket(
     blocking = true;
   }
 
-  // Insert packet(s)
-  // NOTE: need multiple packets to dispatch the performance counter
-  // packet blob of the legacy devices (gfx8)
-  for (uint i = 0; i < size; i++, index++, packet++) {
-    AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
-    *aql_loc = *packet;
-    if (header != 0) {
-      packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
-    }
-    ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
-            "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
-            "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
-            "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, "
-            "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx "
-            "rptr=%u, wptr=%u",
-            gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
-            extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
-            extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
-            extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
-                           HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
-            extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
-                           HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
-            rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
-            reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal, read,
-            index);
-  }
-
-  hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index - 1);
+  AqlPacket* aql_loc = &((AqlPacket*)(gpu_queue_->base_address))[index & queueMask];
+  *aql_loc = *packet;
+  if (header != 0) {
+    packet_store_release(reinterpret_cast<uint32_t*>(aql_loc), header, rest);
+  }
+  ClPrint(amd::LOG_DEBUG, amd::LOG_AQL,
+          "SWq=0x%zx, HWq=0x%zx, id=%d, Dispatch Header = "
+          "0x%x (type=%d, barrier=%d, acquire=%d, release=%d), "
+          "setup=%d, grid=[%zu, %zu, %zu], workgroup=[%zu, %zu, %zu], private_seg_size=%zu, "
+          "group_seg_size=%zu, kernel_obj=0x%zx, kernarg_address=0x%zx, completion_signal=0x%zx "
+          "rptr=%u, wptr=%u",
+          gpu_queue_, gpu_queue_->base_address, gpu_queue_->id, header,
+          extractAqlBits(header, HSA_PACKET_HEADER_TYPE, HSA_PACKET_HEADER_WIDTH_TYPE),
+          extractAqlBits(header, HSA_PACKET_HEADER_BARRIER, HSA_PACKET_HEADER_WIDTH_BARRIER),
+          extractAqlBits(header, HSA_PACKET_HEADER_SCACQUIRE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCACQUIRE_FENCE_SCOPE),
+          extractAqlBits(header, HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE,
+                         HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE),
+          rest, reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_x,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_y,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->grid_size_z,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_x,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_y,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->workgroup_size_z,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->private_segment_size,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->group_segment_size,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernel_object,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->kernarg_address,
+          reinterpret_cast<hsa_kernel_dispatch_packet_t*>(packet)->completion_signal, read,
+          index);
+
+  hsa_signal_store_screlease(gpu_queue_->doorbell_signal, index);
 
   // Mark the flag indicating if a dispatch is outstanding.
   // We are not waiting after every dispatch.
@@ -984,9 +979,8 @@ inline bool VirtualGPU::dispatchAqlPacket(uint8_t* aqlpacket, amd::AccumulateCommand* vcmd) {
     profilingBegin(*vcmd, true);
   }
   dispatchBlockingWait();
-  constexpr size_t kPacketSize = 1;
   auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlpacket);
-  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
+  dispatchGenericAqlPacket(packet, packet->header, packet->setup, false);
   if (vcmd != nullptr) {
     profilingEnd(*vcmd);
   }
@@ -1003,13 +997,6 @@ bool VirtualGPU::dispatchCounterAqlPacket(hsa_ext_amd_aql_pm4_packet_t* packet,
   // In GFX8 the PM4 IB packet blob is writing directly to AQL queue
   // In GFX9 the PM4 IB is submitting by AQL Vendor Specific packet and
   switch (gfxVersion) {
-    case PerfCounter::ROC_GFX8:
-      { // Create legacy devices PM4 data
-        hsa_ext_amd_aql_pm4_packet_t pm4Packet[SLOT_PM4_SIZE_AQLP];
-        extApi->hsa_ven_amd_aqlprofile_legacy_get_pm4(packet, static_cast<void*>(&pm4Packet[0]));
-        return dispatchGenericAqlPacket(&pm4Packet[0], 0, 0, blocking, SLOT_PM4_SIZE_AQLP);
-      }
-      break;
     case PerfCounter::ROC_GFX9:
     case PerfCounter::ROC_GFX10:
     {
@@ -3468,9 +3455,8 @@ void VirtualGPU::submitAccumulate(amd::AccumulateCommand& vcmd) {
   uint8_t* aqlPacket = vcmd.getLastPacket();
   if (aqlPacket != nullptr) {
     dispatchBlockingWait();
-    constexpr size_t kPacketSize = 1;
     auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
-    dispatchGenericAqlPacket(packet, packet->header, packet->setup, false, kPacketSize);
+    dispatchGenericAqlPacket(packet, packet->header, packet->setup, false);
     // We need to set fence_dirty_ flag as we would use a dispatch packet with a completion signal
     // to track graph finish for the last. The sync logic assumes HW event to a barrier packet that
     // has a system scope release. This would cause isFenceDirty() check at top level to insert
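The deleted ROC_GFX8 branch above was the only caller that passed size > 1: it expanded a performance-counter PM4 blob into SLOT_PM4_SIZE_AQLP consecutive packets and relied on a single doorbell ring for all of them, which the lower layers do not support. If such a blob ever had to be dispatched again, it would need to go out one packet at a time, roughly like this (a hypothetical adaptation that reuses the names from the removed code; it is not part of this commit):

// Hypothetical per-packet replay of the removed GFX8 blob path; pm4Packet,
// SLOT_PM4_SIZE_AQLP, extApi, packet and blocking are names from the deleted code.
hsa_ext_amd_aql_pm4_packet_t pm4Packet[SLOT_PM4_SIZE_AQLP];
extApi->hsa_ven_amd_aqlprofile_legacy_get_pm4(packet, static_cast<void*>(&pm4Packet[0]));
bool ok = true;
for (uint i = 0; i < SLOT_PM4_SIZE_AQLP; ++i) {
  // Each call reserves its own write-index slot and rings the doorbell once.
  ok &= dispatchGenericAqlPacket(&pm4Packet[i], 0, 0, blocking);
}
return ok;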
3 changes: 1 addition & 2 deletions rocclr/device/rocm/rocvirtual.hpp
@@ -433,8 +433,7 @@ class VirtualGPU : public device::VirtualDevice {
   bool dispatchAqlPacket(hsa_barrier_and_packet_t* packet, uint16_t header,
                          uint16_t rest, bool blocking = true);
   template <typename AqlPacket> bool dispatchGenericAqlPacket(AqlPacket* packet, uint16_t header,
-                                                              uint16_t rest, bool blocking,
-                                                              size_t size = 1);
+                                                              uint16_t rest, bool blocking);
 
   void dispatchBarrierPacket(uint16_t packetHeader, bool skipSignal = false,
                              hsa_signal_t signal = hsa_signal_t{0});
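With the defaulted size parameter gone from the declaration, every instantiation dispatches exactly one packet per call. The two call shapes visible in this diff are, for illustration only (placeholder values, following the call sites above):

// Kernel-dispatch instantiation (dispatchAqlPacket, submitAccumulate):
auto packet = reinterpret_cast<hsa_kernel_dispatch_packet_t*>(aqlPacket);
dispatchGenericAqlPacket(packet, packet->header, packet->setup, /*blocking=*/false);

// Vendor-specific PM4 instantiation (dispatchCounterAqlPacket):
dispatchGenericAqlPacket(pm4Packet, /*header=*/0, /*rest=*/0, /*blocking=*/true);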
