From a1fc9f584c4aaf8bc1ebfa459fc57a3f26a290d8 Mon Sep 17 00:00:00 2001 From: Philip Yang Date: Fri, 2 Aug 2024 11:28:45 -0400 Subject: [PATCH] drm/amdkfd: Handle queue destroy buffer access race Add helper function kfd_queue_unreference_buffers to reduce queue buffer refcount, separate it from release queue buffers. Because it is circular locking to hold dqm_lock to take vm lock, kfd_ioctl_destroy_queue should take vm lock, unreference queue buffers first, but not release queue buffers, to handle error in case failed to hold vm lock. Then hold dqm_lock to remove queue from queue list and then release queue buffers. Restore process worker restore queue hold dqm_lock, will always find the queue with valid queue buffers. v2 (Felix): - renamed kfd_queue_unreference_buffer(s) to kfd_queue_unref_bo_va(s) - added two FIXME comments for follow up Signed-off-by: Philip Yang Signed-off-by: Felix Kuehling Reviewed-by: Felix Kuehling Signed-off-by: Alex Deucher --- drivers/gpu/drm/amd/amdkfd/kfd_chardev.c | 1 + drivers/gpu/drm/amd/amdkfd/kfd_priv.h | 5 +- .../amd/amdkfd/kfd_process_queue_manager.c | 8 ++- drivers/gpu/drm/amd/amdkfd/kfd_queue.c | 66 ++++++++++++------- 4 files changed, 53 insertions(+), 27 deletions(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c index 0622ebd7e8ef..00350eccd571 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c @@ -400,6 +400,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p, return 0; err_create_queue: + kfd_queue_unref_bo_vas(pdd, &q_properties); kfd_queue_release_buffers(pdd, &q_properties); err_acquire_queue_buf: err_sdma_engine_id: diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h index 057d20446c31..f7c12d4f0abb 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h +++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h @@ -1298,9 +1298,12 @@ void print_queue_properties(struct queue_properties *q); void print_queue(struct queue *q); int kfd_queue_buffer_get(struct amdgpu_vm *vm, void __user *addr, struct amdgpu_bo **pbo, u64 expected_size); -void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo); +void kfd_queue_buffer_put(struct amdgpu_bo **bo); int kfd_queue_acquire_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties); +void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo); +int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, + struct queue_properties *properties); void kfd_queue_ctx_save_restore_size(struct kfd_topology_device *dev); struct mqd_manager *mqd_manager_init_cik(enum KFD_MQD_TYPE type, diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c index f732ee35b531..20ea745729ee 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c @@ -217,6 +217,7 @@ void pqm_uninit(struct process_queue_manager *pqm) list_for_each_entry_safe(pqn, next, &pqm->queues, process_queue_list) { if (pqn->q) { pdd = kfd_get_process_device_data(pqn->q->device, pqm->process); + kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); kfd_queue_release_buffers(pdd, &pqn->q->properties); pqm_clean_queue_resource(pqm, pqn); } @@ -512,7 +513,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) } if (pqn->q) { - retval = kfd_queue_release_buffers(pdd, &pqn->q->properties); + retval = kfd_queue_unref_bo_vas(pdd, &pqn->q->properties); if (retval) goto err_destroy_queue; @@ -526,7 +527,7 @@ int pqm_destroy_queue(struct process_queue_manager *pqm, unsigned int qid) if (retval != -ETIME) goto err_destroy_queue; } - + kfd_queue_release_buffers(pdd, &pqn->q->properties); pqm_clean_queue_resource(pqm, pqn); uninit_queue(pqn->q); } @@ -579,7 +580,8 @@ int pqm_update_queue_properties(struct process_queue_manager *pqm, return -EFAULT; } - kfd_queue_buffer_put(vm, &pqn->q->properties.ring_bo); + kfd_queue_unref_bo_va(vm, &pqn->q->properties.ring_bo); + kfd_queue_buffer_put(&pqn->q->properties.ring_bo); amdgpu_bo_unreserve(vm->root.bo); pqn->q->properties.ring_bo = p->ring_bo; diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c index e0a073ae4a49..ad29634f8b44 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_queue.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_queue.c @@ -224,16 +224,9 @@ out_err: return -EINVAL; } -void kfd_queue_buffer_put(struct amdgpu_vm *vm, struct amdgpu_bo **bo) +/* FIXME: remove this function, just call amdgpu_bo_unref directly */ +void kfd_queue_buffer_put(struct amdgpu_bo **bo) { - if (*bo) { - struct amdgpu_bo_va *bo_va; - - bo_va = amdgpu_vm_bo_find(vm, *bo); - if (bo_va) - bo_va->queue_refcount--; - } - amdgpu_bo_unref(bo); } @@ -327,6 +320,10 @@ out_unreserve: out_err_unreserve: amdgpu_bo_unreserve(vm->root.bo); out_err_release: + /* FIXME: make a _locked version of this that can be called before + * dropping the VM reservation. + */ + kfd_queue_unref_bo_vas(pdd, properties); kfd_queue_release_buffers(pdd, properties); return err; } @@ -334,22 +331,13 @@ out_err_release: int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_properties *properties) { struct kfd_topology_device *topo_dev; - struct amdgpu_vm *vm; u32 total_cwsr_size; - int err; - vm = drm_priv_to_vm(pdd->drm_priv); - err = amdgpu_bo_reserve(vm->root.bo, false); - if (err) - return err; - - kfd_queue_buffer_put(vm, &properties->wptr_bo); - kfd_queue_buffer_put(vm, &properties->rptr_bo); - kfd_queue_buffer_put(vm, &properties->ring_bo); - kfd_queue_buffer_put(vm, &properties->eop_buf_bo); - kfd_queue_buffer_put(vm, &properties->cwsr_bo); - - amdgpu_bo_unreserve(vm->root.bo); + kfd_queue_buffer_put(&properties->wptr_bo); + kfd_queue_buffer_put(&properties->rptr_bo); + kfd_queue_buffer_put(&properties->ring_bo); + kfd_queue_buffer_put(&properties->eop_buf_bo); + kfd_queue_buffer_put(&properties->cwsr_bo); topo_dev = kfd_topology_device_by_id(pdd->dev->id); if (!topo_dev) @@ -362,6 +350,38 @@ int kfd_queue_release_buffers(struct kfd_process_device *pdd, struct queue_prope return 0; } +void kfd_queue_unref_bo_va(struct amdgpu_vm *vm, struct amdgpu_bo **bo) +{ + if (*bo) { + struct amdgpu_bo_va *bo_va; + + bo_va = amdgpu_vm_bo_find(vm, *bo); + if (bo_va && bo_va->queue_refcount) + bo_va->queue_refcount--; + } +} + +int kfd_queue_unref_bo_vas(struct kfd_process_device *pdd, + struct queue_properties *properties) +{ + struct amdgpu_vm *vm; + int err; + + vm = drm_priv_to_vm(pdd->drm_priv); + err = amdgpu_bo_reserve(vm->root.bo, false); + if (err) + return err; + + kfd_queue_unref_bo_va(vm, &properties->wptr_bo); + kfd_queue_unref_bo_va(vm, &properties->rptr_bo); + kfd_queue_unref_bo_va(vm, &properties->ring_bo); + kfd_queue_unref_bo_va(vm, &properties->eop_buf_bo); + kfd_queue_unref_bo_va(vm, &properties->cwsr_bo); + + amdgpu_bo_unreserve(vm->root.bo); + return 0; +} + #define SGPR_SIZE_PER_CU 0x4000 #define LDS_SIZE_PER_CU 0x10000 #define HWREG_SIZE_PER_CU 0x1000 -- 2.39.5