Skip to content

Commit

Permalink
drm/amdkfd: APIs to stop/start KFD scheduling
Browse files Browse the repository at this point in the history
Provide amdgpu_amdkfd_stop_sched() for amdgpu to stop KFD scheduling
compute work on HIQ. amdgpu_amdkfd_start_sched() resumes the scheduling.
When amdgpu_amdkfd_stop_sched is called, KFD will unmap queues from
runlist. If users send ioctls to KFD to create queues, they'll be added
but those queues won't be mapped to runlist (so not scheduled) until
amdgpu_amdkfd_start_sched is called.

v2: fix build (Alex)

Signed-off-by: Amber Lin <[email protected]>
Signed-off-by: Alex Deucher <[email protected]>
  • Loading branch information
ascollard authored and alexdeucher committed Aug 21, 2024
1 parent b1f49ff commit 234eebe
Show file tree
Hide file tree
Showing 5 changed files with 137 additions and 1 deletion.
18 changes: 18 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.c
Original file line number Diff line number Diff line change
Expand Up @@ -887,3 +887,21 @@ int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,

return r;
}

/*
 * Halt KFD scheduling on one node.
 *
 * Forwards to kgd2kfd_stop_sched() with this device's KFD handle and the
 * caller's node_id, returning its result unchanged. If KFD has not finished
 * initializing there is nothing to halt, so 0 is returned without calling
 * into KFD.
 */
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id)
{
	if (adev->kfd.init_complete)
		return kgd2kfd_stop_sched(adev->kfd.dev, node_id);

	return 0;
}

/*
 * Resume KFD scheduling on one node.
 *
 * Forwards to kgd2kfd_start_sched() with this device's KFD handle and the
 * caller's node_id, returning its result unchanged. If KFD has not finished
 * initializing there is nothing to resume, so 0 is returned without calling
 * into KFD.
 */
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id)
{
	if (adev->kfd.init_complete)
		return kgd2kfd_start_sched(adev->kfd.dev, node_id);

	return 0;
}
14 changes: 14 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -264,6 +264,8 @@ int amdgpu_amdkfd_send_close_event_drain_irq(struct amdgpu_device *adev,
uint32_t *payload);
int amdgpu_amdkfd_unmap_hiq(struct amdgpu_device *adev, u32 doorbell_off,
u32 inst);
/* Resume / halt KFD scheduling on node @node_id. Both return 0 without
 * calling into KFD when KFD initialization has not completed; otherwise
 * they return the result of the matching kgd2kfd_*_sched() call.
 */
int amdgpu_amdkfd_start_sched(struct amdgpu_device *adev, uint32_t node_id);
int amdgpu_amdkfd_stop_sched(struct amdgpu_device *adev, uint32_t node_id);

/* Read user wptr from a specified user address space with page fault
* disabled. The memory must be pinned and mapped to the hardware when
Expand Down Expand Up @@ -426,6 +428,8 @@ void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint64_t throttle_bitmask);
int kgd2kfd_check_and_lock_kfd(void);
void kgd2kfd_unlock_kfd(void);
/* Resume / halt queue scheduling on KFD node @node_id through that node's
 * device queue manager. Return 0 on success or when @kfd is not yet
 * initialized, -EINVAL for an out-of-range @node_id.
 */
int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id);
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id);
#else
static inline int kgd2kfd_init(void)
{
Expand Down Expand Up @@ -496,5 +500,15 @@ static inline int kgd2kfd_check_and_lock_kfd(void)
static inline void kgd2kfd_unlock_kfd(void)
{
}

/* Fallback for the #else branch of the surrounding conditional: there is no
 * scheduler to resume, so report success.
 */
static inline int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
{
return 0;
}

/* Fallback for the #else branch of the surrounding conditional: there is no
 * scheduler to halt, so report success.
 */
static inline int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
return 0;
}
#endif
#endif /* AMDGPU_AMDKFD_H_INCLUDED */
39 changes: 39 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_device.c
Original file line number Diff line number Diff line change
Expand Up @@ -1446,6 +1446,45 @@ void kgd2kfd_unlock_kfd(void)
mutex_unlock(&kfd_processes_mutex);
}

int kgd2kfd_start_sched(struct kfd_dev *kfd, uint32_t node_id)
{
struct kfd_node *node;
int ret;

if (!kfd->init_complete)
return 0;

if (node_id >= kfd->num_nodes) {
dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
node_id, kfd->num_nodes - 1);
return -EINVAL;
}
node = kfd->nodes[node_id];

ret = node->dqm->ops.unhalt(node->dqm);
if (ret)
dev_err(kfd_device, "Error in starting scheduler\n");

return ret;
}

/*
 * Halt queue scheduling on one KFD node.
 *
 * @kfd:     KFD device that owns the node.
 * @node_id: index into kfd->nodes; must be below kfd->num_nodes.
 *
 * Returns 0 when @kfd has not finished initializing (nothing to halt),
 * -EINVAL (after a warning) when @node_id is out of range, otherwise the
 * result of the halt operation of the node's device queue manager.
 */
int kgd2kfd_stop_sched(struct kfd_dev *kfd, uint32_t node_id)
{
	struct kfd_node *target;

	if (!kfd->init_complete)
		return 0;

	if (node_id < kfd->num_nodes) {
		target = kfd->nodes[node_id];
		return target->dqm->ops.halt(target->dqm);
	}

	dev_warn(kfd->adev->dev, "Invalid node ID: %u exceeds %u\n",
		 node_id, kfd->num_nodes - 1);
	return -EINVAL;
}

#if defined(CONFIG_DEBUG_FS)

/* This function will send a package to HIQ to hang the HWS
Expand Down
58 changes: 57 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.c
Original file line number Diff line number Diff line change
Expand Up @@ -1679,6 +1679,60 @@ static int initialize_cpsch(struct device_queue_manager *dqm)
return 0;
}

/* halt_cpsch:
 * Take the queues off the runlist so the scheduler stops working on what is
 * still pending, then raise dqm->sched_halt so queues stay unmapped until
 * unhalt_cpsch is called.
 *
 * Runs entirely under the dqm lock. Returns 0 without touching anything when
 * the scheduler is not running. The unmap/remove step is skipped while
 * dqm->is_hws_hang is set. sched_halt is raised even if that step fails; the
 * step's return code is what the caller gets back.
 */
static int halt_cpsch(struct device_queue_manager *dqm)
{
	int status = 0;

	dqm_lock(dqm);

	if (!dqm->sched_running)
		goto unlock;

	WARN_ONCE(dqm->sched_halt, "Scheduling is already on halt\n");

	if (!dqm->is_hws_hang) {
		if (dqm->dev->kfd->shared_resources.enable_mes)
			status = remove_all_queues_mes(dqm);
		else
			status = unmap_queues_cpsch(dqm,
					KFD_UNMAP_QUEUES_FILTER_ALL_QUEUES, 0,
					USE_DEFAULT_GRACE_PERIOD, false);
	}

	dqm->sched_halt = true;

unlock:
	dqm_unlock(dqm);
	return status;
}

/* unhalt_cpsch
 * Clear dqm->sched_halt and, when MES is not enabled, run
 * execute_queues_cpsch() so queues are mapped back to the runlist.
 *
 * Runs entirely under the dqm lock. Returns 0 without changing anything when
 * the scheduler is not running or is not currently halted; the not-halted
 * case also raises a one-time warning. Otherwise returns the result of
 * execute_queues_cpsch(), or 0 on the MES path.
 *
 * NOTE(review): with MES enabled only the flag is cleared here; nothing in
 * this function re-adds queues — confirm that is intended.
 */
static int unhalt_cpsch(struct device_queue_manager *dqm)
{
	int status = 0;

	dqm_lock(dqm);

	if (!(dqm->sched_running && dqm->sched_halt)) {
		WARN_ONCE(!dqm->sched_halt, "Scheduling is not on halt.\n");
		goto unlock;
	}

	dqm->sched_halt = false;
	if (!dqm->dev->kfd->shared_resources.enable_mes)
		status = execute_queues_cpsch(dqm,
				KFD_UNMAP_QUEUES_FILTER_DYNAMIC_QUEUES,
				0, USE_DEFAULT_GRACE_PERIOD);

unlock:
	dqm_unlock(dqm);
	return status;
}

static int start_cpsch(struct device_queue_manager *dqm)
{
struct device *dev = dqm->dev->adev->dev;
Expand Down Expand Up @@ -1984,7 +2038,7 @@ static int map_queues_cpsch(struct device_queue_manager *dqm)
struct device *dev = dqm->dev->adev->dev;
int retval;

if (!dqm->sched_running)
if (!dqm->sched_running || dqm->sched_halt)
return 0;
if (dqm->active_queue_count <= 0 || dqm->processes_count <= 0)
return 0;
Expand Down Expand Up @@ -2727,6 +2781,8 @@ struct device_queue_manager *device_queue_manager_init(struct kfd_node *dev)
dqm->ops.initialize = initialize_cpsch;
dqm->ops.start = start_cpsch;
dqm->ops.stop = stop_cpsch;
dqm->ops.halt = halt_cpsch;
dqm->ops.unhalt = unhalt_cpsch;
dqm->ops.destroy_queue = destroy_queue_cpsch;
dqm->ops.update_queue = update_queue;
dqm->ops.register_process = register_process;
Expand Down
9 changes: 9 additions & 0 deletions drivers/gpu/drm/amd/amdkfd/kfd_device_queue_manager.h
Original file line number Diff line number Diff line change
Expand Up @@ -106,6 +106,12 @@ union GRBM_GFX_INDEX_BITS {
* @uninitialize: Destroys all the device queue manager resources allocated in
* initialize routine.
*
* @halt: This routine unmaps queues from the runlist and sets the halt status
* to true so no more queues will be mapped to the runlist until unhalt.
*
* @unhalt: This routine resets the halt status to false and maps queues back
* to the runlist.
*
* @create_kernel_queue: Creates kernel queue. Used for debug queue.
*
* @destroy_kernel_queue: Destroys kernel queue. Used for debug queue.
Expand Down Expand Up @@ -153,6 +159,8 @@ struct device_queue_manager_ops {
int (*start)(struct device_queue_manager *dqm);
int (*stop)(struct device_queue_manager *dqm);
void (*uninitialize)(struct device_queue_manager *dqm);
int (*halt)(struct device_queue_manager *dqm);
int (*unhalt)(struct device_queue_manager *dqm);
int (*create_kernel_queue)(struct device_queue_manager *dqm,
struct kernel_queue *kq,
struct qcm_process_device *qpd);
Expand Down Expand Up @@ -264,6 +272,7 @@ struct device_queue_manager {
struct work_struct hw_exception_work;
struct kfd_mem_obj hiq_sdma_mqd;
bool sched_running;
bool sched_halt;

/* used for GFX 9.4.3 only */
uint32_t current_logical_xcc_start;
Expand Down

0 comments on commit 234eebe

Please sign in to comment.