Skip to content

Commit

Permalink
rocr: Generic ISA targets support
Browse files Browse the repository at this point in the history
Change-Id: I6a0341ec9c1ec1e710143676b80a8a3c1a78f725
  • Loading branch information
cfreeamd committed Oct 28, 2024
1 parent 0869906 commit 0c18ff2
Show file tree
Hide file tree
Showing 17 changed files with 368 additions and 202 deletions.
20 changes: 18 additions & 2 deletions runtime/hsa-runtime/core/inc/agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -236,6 +236,18 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
hsa_status_t (*callback)(hsa_region_t region, void* data),
void* data) const = 0;

// @brief Invoke the user provided callback for each isa supported by
// this agent.
//
// @param [in] callback User provided callback function.
// @param [in] data User provided pointer as input for @p callback.
//
// @retval ::HSA_STATUS_SUCCESS if the callback function for each traversed
// isa returns ::HSA_STATUS_SUCCESS.
virtual hsa_status_t IterateSupportedIsas(
hsa_status_t (*callback)(hsa_isa_t isa, void* data),
void* data) const = 0;

// @brief Invoke the callback for each cache useable by this agent.
virtual hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
void* data) const = 0;
Expand Down Expand Up @@ -278,8 +290,11 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
// @brief Returns an array of regions owned by the agent.
virtual const std::vector<const core::MemoryRegion*>& regions() const = 0;

// @details Returns the agent's instruction set architecture.
virtual const Isa* isa() const = 0;
// @brief Returns the ISAs supported by the agent.
// @details The returned vector is a list of pointers to the supported ISAs,
// ordered from most specific (and performant) to most generic. For CPU
// and AIE agents, this list will be empty.
virtual const std::vector<const core::Isa *>& supported_isas() const = 0;

virtual uint64_t HiveId() const { return 0; }

Expand Down Expand Up @@ -343,6 +358,7 @@ class Agent : public Checked<0xF6BC25EB17E6F917> {
}

hsa_agent_t public_handle_;
std::vector<const core::Isa *> supported_isas_;

private:
// @brief Node id.
Expand Down
9 changes: 8 additions & 1 deletion runtime/hsa-runtime/core/inc/amd_aie_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -72,6 +72,10 @@ class AieAgent : public core::Agent {
void *data),
void *value) const override;

hsa_status_t IterateSupportedIsas(
hsa_status_t (*callback)(hsa_isa_t isa, void* data),
void* data) const override;

hsa_status_t GetInfo(hsa_agent_info_t attribute, void *value) const override;

hsa_status_t QueueCreate(size_t size, hsa_queue_type32_t queue_type,
Expand All @@ -80,7 +84,10 @@ class AieAgent : public core::Agent {
uint32_t group_segment_size,
core::Queue **queue) override;

const core::Isa *isa() const override { return nullptr; }
// @brief Override from core::Agent.
// @details Returns a reference to the supported_isas_ member. The
// core::Agent documentation states that this list is empty for AIE agents.
const std::vector<const core::Isa*>& supported_isas() const override {
return supported_isas_;
}

const std::vector<const core::MemoryRegion *> &regions() const override {
return regions_;
Expand Down
11 changes: 8 additions & 3 deletions runtime/hsa-runtime/core/inc/amd_cpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ class CpuAgent : public core::Agent {
void* data),
void* data) const override;

hsa_status_t IterateSupportedIsas(
hsa_status_t (*callback)(hsa_isa_t isa, void* data),
void* data) const override;

// @brief Override from core::Agent.
hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
void* value) const override;
Expand Down Expand Up @@ -127,9 +131,10 @@ class CpuAgent : public core::Agent {
return regions_;
}

// @brief OVerride from core::Agent.
const core::Isa* isa() const override { return NULL; }

// @brief Override from core::Agent.
// @details Returns a reference to the supported_isas_ member. The
// core::Agent documentation states that this list is empty for CPU agents.
const std::vector<const core::Isa*>& supported_isas() const override {
return supported_isas_;
}
private:
// @brief Query the driver to get the region list owned by this agent.
void InitRegionList();
Expand Down
11 changes: 8 additions & 3 deletions runtime/hsa-runtime/core/inc/amd_gpu_agent.h
Original file line number Diff line number Diff line change
Expand Up @@ -275,6 +275,10 @@ class GpuAgent : public GpuAgentInt {
void* data),
void* data) const override;

hsa_status_t IterateSupportedIsas(
hsa_status_t (*callback)(hsa_isa_t isa, void* data),
void* data) const override;

// @brief Override from core::Agent.
hsa_status_t IterateCache(hsa_status_t (*callback)(hsa_cache_t cache, void* data),
void* value) const override;
Expand Down Expand Up @@ -381,8 +385,8 @@ class GpuAgent : public GpuAgentInt {
return regions_;
}

// @brief Override from core::Agent.
const core::Isa* isa() const override { return isa_; }
// @brief Override from core::Agent.
// @details Returns a reference to the supported_isas_ member.
const std::vector<const core::Isa*>& supported_isas() const override {
  return supported_isas_;
}

// @brief Override from AMD::GpuAgentInt.
__forceinline bool is_kv_device() const override { return is_kv_device_; }
Expand Down Expand Up @@ -432,7 +436,8 @@ class GpuAgent : public GpuAgentInt {
__forceinline bool AsyncScratchReclaimEnabled() const override {
// TODO: Need to update min CP FW ucode version once it is released
return (core::Runtime::runtime_singleton_->flag().enable_scratch_async_reclaim() &&
isa()->GetMajorVersion() == 9 && isa()->GetMinorVersion() == 4 &&
supported_isas()[0]->GetMajorVersion() == 9 &&
supported_isas()[0]->GetMinorVersion() == 4 &&
properties_.EngineId.ui32.uCode > 999);
};

Expand Down
6 changes: 1 addition & 5 deletions runtime/hsa-runtime/core/inc/amd_hsa_loader.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -163,11 +163,7 @@ class Context {

virtual hsa_isa_t IsaFromName(const char *name) = 0;

// This function will be deleted in a future patch. Use the overload
// that takes a generic version instead.
virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa) = 0;

virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa, unsigned genericVersion) { return IsaSupportedByAgent(agent, isa); }
virtual bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t isa, unsigned genericVersion) = 0;

virtual void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) = 0;

Expand Down
2 changes: 1 addition & 1 deletion runtime/hsa-runtime/core/inc/amd_loader_context.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ class LoaderContext final : public rocr::amd::hsa::loader::Context {

hsa_isa_t IsaFromName(const char *name) override;

bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t code_object_isa) override;
bool IsaSupportedByAgent(hsa_agent_t agent, hsa_isa_t code_object_isa, unsigned codeGenericVersion) override;

void* SegmentAlloc(amdgpu_hsa_elf_segment_t segment, hsa_agent_t agent, size_t size, size_t align, bool zero) override;

Expand Down
17 changes: 12 additions & 5 deletions runtime/hsa-runtime/core/inc/isa.h
Original file line number Diff line number Diff line change
Expand Up @@ -117,12 +117,16 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> {

/// @returns True if @p code_object_isa and @p agent_isa are compatible,
/// false otherwise.
static bool IsCompatible(const Isa &code_object_isa, const Isa &agent_isa);
static bool IsCompatible(const Isa &code_object_isa,
const Isa &agent_isa, unsigned int codeGenericVersion);

/// @returns This Isa's version.
const Version &GetVersion() const {
return version_;
}
/// @returns This Isa's generic target, or an empty string if it has none.
const std::string &GetIsaGeneric() const {
  return generic_;
}


/// @returns SRAM ECC feature status.
IsaFeature GetSramecc() const {
Expand Down Expand Up @@ -188,13 +192,15 @@ class Isa final: public amd::hsa::common::Signed<0xB13594F2BD8F212D> {
private:
/// @brief Default constructor.
Isa()
: targetid_(nullptr),
version_(Version(-1, -1, -1)),
: version_(Version(-1, -1, -1)),
sramecc_(IsaFeature::Unsupported),
xnack_(IsaFeature::Unsupported) {}

// @brief Isa's target ID name.
const char* targetid_;
std::string targetid_;

// @brief Isa's generic target, if it exists. "" otherwise.
std::string generic_;

/// @brief Isa's version.
Version version_;
Expand Down Expand Up @@ -223,7 +229,8 @@ class IsaRegistry final {
static const Isa *GetIsa(const Isa::Version &version,
IsaFeature sramecc = IsaFeature::Any,
IsaFeature xnack = IsaFeature::Any);

static const std::unordered_map<std::string, unsigned int> &
GetSupportedGenericVersions();
private:
/// @brief IsaRegistry's map type.
typedef std::unordered_map<std::string, std::reference_wrapper<const Isa>> IsaMap;
Expand Down
11 changes: 11 additions & 0 deletions runtime/hsa-runtime/core/runtime/amd_aie_agent.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -91,6 +91,17 @@ hsa_status_t AieAgent::IterateCache(hsa_status_t (*callback)(hsa_cache_t cache,
return HSA_STATUS_ERROR_INVALID_CACHE;
}

// @brief Invokes @p callback once for each entry of supported_isas(), in
// list order.
//
// @param [in] callback User provided callback function.
// @param [in] data User provided pointer forwarded to @p callback.
//
// @retval ::HSA_STATUS_SUCCESS if every invocation returned
// ::HSA_STATUS_SUCCESS (including when the list is empty). Otherwise the
// first non-success status is returned and iteration stops there.
hsa_status_t AieAgent::IterateSupportedIsas(
    hsa_status_t (*callback)(hsa_isa_t isa, void* data),
    void* data) const {
  AMD::callback_t<decltype(callback)> invoke(callback);
  hsa_status_t status = HSA_STATUS_SUCCESS;
  for (const core::Isa* supported : supported_isas()) {
    status = invoke(core::Isa::Handle(supported), data);
    // Propagate the first failure without visiting the remaining ISAs.
    if (status != HSA_STATUS_SUCCESS) break;
  }
  return status;
}

hsa_status_t AieAgent::GetInfo(hsa_agent_info_t attribute, void *value) const {
const size_t attribute_ = static_cast<size_t>(attribute);

Expand Down
33 changes: 18 additions & 15 deletions runtime/hsa-runtime/core/runtime/amd_aql_queue.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -100,7 +100,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
// Values written to the HW doorbell are modulo the doubled size.
// This allows the HW to accept (doorbell == last_doorbell + queue_size).
// This workaround is required for GFXIP 7 and GFXIP 8 ASICs.
const core::Isa* isa = agent_->isa();
const core::Isa* isa = agent_->supported_isas()[0];
queue_full_workaround_ =
(isa->GetMajorVersion() == 7 || isa->GetMajorVersion() == 8)
? 1
Expand Down Expand Up @@ -208,7 +208,7 @@ AqlQueue::AqlQueue(GpuAgent* agent, size_t req_size_pkts, HSAuint32 node_id, Scr
assert(amd_queue_.private_segment_aperture_base_hi != 0 && "No private region found.");
}

if (agent_->isa()->GetMajorVersion() >= 11)
if (agent_->supported_isas()[0]->GetMajorVersion() >= 11)
queue_scratch_.mem_alignment_size = 256;
else
queue_scratch_.mem_alignment_size = 1024;
Expand Down Expand Up @@ -1016,8 +1016,9 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,

// For gfx10+ devices we must attempt to assign the smaller of 256 lanes or 16 groups to each
// engine.
if (agent_->isa()->GetMajorVersion() >= 10 && maxGroupsPerEngine < 16 &&
lanes_per_group * maxGroupsPerEngine < 256) {
if (agent_->supported_isas()[0]->GetMajorVersion() >= 10 &&
maxGroupsPerEngine < 16 &&
lanes_per_group * maxGroupsPerEngine < 256) {
uint64_t groups_per_interleave = (256 + lanes_per_group - 1) / lanes_per_group;
maxGroupsPerEngine = Min(groups_per_interleave, 16ul);
}
Expand Down Expand Up @@ -1118,7 +1119,7 @@ void AqlQueue::HandleInsufficientScratch(hsa_signal_value_t& error_code,
if (scratch.large) {
amd_queue_.queue_properties |= AMD_QUEUE_PROPERTIES_USE_SCRATCH_ONCE;
// Set system release fence to flush scratch stores with older firmware versions.
if ((agent_->isa()->GetMajorVersion() == 8) && (agent_->GetMicrocodeVersion() < 729)) {
if ((agent_->supported_isas()[0]->GetMajorVersion() == 8) && (agent_->GetMicrocodeVersion() < 729)) {
pkt->dispatch.header &=
~(((1 << HSA_PACKET_HEADER_WIDTH_SCRELEASE_FENCE_SCOPE) - 1)
<< HSA_PACKET_HEADER_SCRELEASE_FENCE_SCOPE);
Expand Down Expand Up @@ -1339,7 +1340,7 @@ bool AqlQueue::ExceptionHandler(hsa_signal_value_t error_code, void* arg) {
// Fallback if KFD does not support GPU core dump. In this case, the core dump is
// generated by hsa-runtime.
if (!core::Runtime::runtime_singleton_->KfdVersion().supports_core_dump &&
queue->agent_->isa()->GetMajorVersion() != 11) {
queue->agent_->supported_isas()[0]->GetMajorVersion() != 11) {

if (pcs::PcsRuntime::instance()->SessionsActive())
fprintf(stderr, "GPU core dump skipped because PC Sampling active\n");
Expand Down Expand Up @@ -1412,7 +1413,7 @@ hsa_status_t AqlQueue::SetCUMasking(uint32_t num_cu_mask_count, const uint32_t*
if ((!cu_mask_.empty()) || (num_cu_mask_count != 0) || (!global_mask.empty())) {

// Devices with WGPs must conform to even-indexed contiguous pairwise CU enablement.
if (agent_->isa()->GetMajorVersion() >= 10) {
if (agent_->supported_isas()[0]->GetMajorVersion() >= 10) {
for (int i = 0; i < mask.size() * 32; i += 2) {
uint32_t cu_pair = (mask[i / 32] >> (i % 32)) & 0x3;
if (cu_pair && cu_pair != 0x3) return HSA_STATUS_ERROR_INVALID_ARGUMENT;
Expand Down Expand Up @@ -1481,7 +1482,8 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope
constexpr uint32_t ib_jump_size_dw = 4;

uint32_t ib_jump_cmd[ib_jump_size_dw] = {
PM4_HDR(PM4_HDR_IT_OPCODE_INDIRECT_BUFFER, ib_jump_size_dw, agent_->isa()->GetMajorVersion()),
PM4_HDR(PM4_HDR_IT_OPCODE_INDIRECT_BUFFER, ib_jump_size_dw,
agent_->supported_isas()[0]->GetMajorVersion()),
PM4_INDIRECT_BUFFER_DW1_IB_BASE_LO(uint32_t(uintptr_t(pm4_ib_buf_) >> 2)),
PM4_INDIRECT_BUFFER_DW2_IB_BASE_HI(uint32_t(uintptr_t(pm4_ib_buf_) >> 32)),
(PM4_INDIRECT_BUFFER_DW3_IB_SIZE(uint32_t(cmd_size_b / sizeof(uint32_t))) |
Expand All @@ -1493,7 +1495,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope
hsa_signal_t local_signal = {0};
hsa_status_t err;

if (agent_->isa()->GetMajorVersion() <= 8) {
if (agent_->supported_isas()[0]->GetMajorVersion() <= 8) {
// Construct a set of PM4 to fit inside the AQL packet slot.
uint32_t slot_dw_idx = 0;

Expand All @@ -1504,7 +1506,8 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope
uint32_t* nop_pad = &slot_data[slot_dw_idx];
slot_dw_idx += nop_pad_size_dw;

nop_pad[0] = PM4_HDR(PM4_HDR_IT_OPCODE_NOP, nop_pad_size_dw, agent_->isa()->GetMajorVersion());
nop_pad[0] = PM4_HDR(PM4_HDR_IT_OPCODE_NOP, nop_pad_size_dw,
agent_->supported_isas()[0]->GetMajorVersion());

for (uint32_t i = 1; i < nop_pad_size_dw; ++i) {
nop_pad[i] = 0;
Expand All @@ -1523,15 +1526,15 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope
assert(slot_dw_idx + rel_mem_size_dw <= slot_size_dw && "PM4 exceeded queue slot size");
uint32_t* rel_mem = &slot_data[slot_dw_idx];

rel_mem[0] =
PM4_HDR(PM4_HDR_IT_OPCODE_RELEASE_MEM, rel_mem_size_dw, agent_->isa()->GetMajorVersion());
rel_mem[0] = PM4_HDR(PM4_HDR_IT_OPCODE_RELEASE_MEM, rel_mem_size_dw,
agent_->supported_isas()[0]->GetMajorVersion());
rel_mem[1] = PM4_RELEASE_MEM_DW1_EVENT_INDEX(PM4_RELEASE_MEM_EVENT_INDEX_AQL);
rel_mem[2] = 0;
rel_mem[3] = 0;
rel_mem[4] = 0;
rel_mem[5] = 0;
rel_mem[6] = 0;
} else if (agent_->isa()->GetMajorVersion() >= 9) {
} else if (agent_->supported_isas()[0]->GetMajorVersion() >= 9) {
// Construct an AQL packet to jump to the PM4 IB.
struct amd_aql_pm4_ib {
uint16_t header;
Expand Down Expand Up @@ -1582,7 +1585,7 @@ void AqlQueue::ExecutePM4(uint32_t* cmd_data, size_t cmd_size_b, hsa_fence_scope
doorbell->StoreRelease(write_idx);

// Wait for the packet to be consumed.
if (agent_->isa()->GetMajorVersion() <= 8) {
if (agent_->supported_isas()[0]->GetMajorVersion() <= 8) {
while (queue->LoadReadIndexRelaxed() <= write_idx)
os::YieldThread();

Expand Down Expand Up @@ -1863,7 +1866,7 @@ void AqlQueue::FillComputeTmpRingSize_Gfx12() {
// @brief Define the Scratch Buffer Descriptor and related parameters
// that enable kernel access scratch memory
void AqlQueue::InitScratchSRD() {
switch (agent_->isa()->GetMajorVersion()) {
switch (agent_->supported_isas()[0]->GetMajorVersion()) {
case 12:
FillBufRsrcWord0();
FillBufRsrcWord1_Gfx11();
Expand Down
25 changes: 14 additions & 11 deletions runtime/hsa-runtime/core/runtime/amd_blit_sdma.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -151,15 +151,16 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
}

// Some GFX9 devices require a minimum of 64 DWORDS per ring buffer submission.
if (agent_->isa()->GetVersion() >= core::Isa::Version(9, 0, 0) &&
(agent_->isa()->GetVersion() <= core::Isa::Version(9, 0, 4) ||
agent_->isa()->GetVersion() == core::Isa::Version(9, 0, 12))) {
if (agent_->supported_isas()[0]->GetVersion() >= core::Isa::Version(9, 0, 0) &&
(agent_->supported_isas()[0]->GetVersion() <= core::Isa::Version(9, 0, 4) ||
agent_->supported_isas()[0]->GetVersion() == core::Isa::Version(9, 0, 12))) {
min_submission_size_ = 256;
}

const core::Runtime::LinkInfo& link = core::Runtime::runtime_singleton_->GetLinkInfo(
agent_->node_id(), core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
if (agent_->isa()->GetVersion() == core::Isa::Version(7, 0, 1)) {
const core::Runtime::LinkInfo& link =
core::Runtime::runtime_singleton_->GetLinkInfo( agent_->node_id(),
core::Runtime::runtime_singleton_->cpu_agents()[0]->node_id());
if (agent_->supported_isas()[0]->GetVersion() == core::Isa::Version(7, 0, 1)) {
platform_atomic_support_ = false;
} else {
platform_atomic_support_ = link.info.atomic_support_64bit;
Expand All @@ -169,8 +170,8 @@ hsa_status_t BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>:
// gfx90a can support xGMI host to device connections so bypass HDP flush
// in this case.
// gfx101x seems to have issues with HDP flushes
if (agent_->isa()->GetMajorVersion() >= 9 &&
!(agent_->isa()->GetMajorVersion() == 10 && agent_->isa()->GetMinorVersion() == 1)) {
if (agent_->supported_isas()[0]->GetMajorVersion() >= 9 &&
!(agent_->supported_isas()[0]->GetMajorVersion() == 10 && agent_->supported_isas()[0]->GetMinorVersion() == 1)) {
hdp_flush_support_ = link.info.link_type != HSA_AMD_LINK_INFO_TYPE_XGMI;
}

Expand Down Expand Up @@ -556,7 +557,8 @@ BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::SubmitCopyRe
throw AMD::hsa_exception(HSA_STATUS_ERROR_INVALID_ARGUMENT, "Copy rect slice needed.");

// GFX12 or later use a different packet format that is incompatible (fields changed in size and location).
const bool isGFX12Plus = (agent_->isa()->GetMajorVersion() >= 12);
const bool isGFX12Plus =
(agent_->supported_isas()[0]->GetMajorVersion() >= 12);

// Common and GFX12 packet must match in size to use same code for vector/append.
static_assert(sizeof(SDMA_PKT_COPY_LINEAR_RECT) == sizeof(SDMA_PKT_COPY_LINEAR_RECT_GFX12), "");
Expand Down Expand Up @@ -777,7 +779,7 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildFe

packet_addr->HEADER_UNION.op = SDMA_OP_FENCE;

if (agent_->isa()->GetMajorVersion() >= 10) {
if (agent_->supported_isas()[0]->GetMajorVersion() >= 10) {
packet_addr->HEADER_UNION.mtype = 3;
}

Expand Down Expand Up @@ -847,7 +849,8 @@ void BlitSdma<RingIndexTy, HwIndexMonotonic, SizeToCountOffset, useGCR>::BuildCo
};

// GFX12 or later use a different packet format that is incompatible (fields changed in size and location).
const bool isGFX12Plus = (agent_->isa()->GetMajorVersion() >= 12);
const bool isGFX12Plus =
(agent_->supported_isas()[0]->GetMajorVersion() >= 12);

// Limits in terms of element count
const uint32_t max_pitch = 1 << (isGFX12Plus ? SDMA_PKT_COPY_LINEAR_RECT_GFX12::pitch_bits : SDMA_PKT_COPY_LINEAR_RECT::pitch_bits);
Expand Down
Loading

0 comments on commit 0c18ff2

Please sign in to comment.