[PATCH 4/4] drm/amdkfd: add reset lock protection for kfd entry functions
Dennis Li
Dennis.Li at amd.com
Thu Mar 18 07:23:39 UTC 2021
When doing a GPU reset, try to block all kfd functions that may access
hardware, including the kfd ioctls and the file close function.
v2: fix a potential recursive locking issue
kfd_ioctl_dbg_register may call into pqm_create_queue, which would
cause recursive locking. So remove the read_lock from the process
queue manager and take the read_lock in the related ioctls instead.
v3: put pqm_query_dev_by_qid under the protection of p->mutex
Signed-off-by: Dennis Li <Dennis.Li at amd.com>
Acked-by: Christian König <christian.koenig at amd.com>
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
index 6802c616e10e..283ba9435233 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
@@ -40,6 +40,7 @@
#include "kfd_dbgmgr.h"
#include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"
+#include "amdgpu.h"
static long kfd_ioctl(struct file *, unsigned int, unsigned long);
static int kfd_open(struct inode *, struct file *);
@@ -298,6 +299,9 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
}
mutex_lock(&p->mutex);
+ err = amdgpu_read_lock(dev->ddev, true);
+ if (err)
+ goto err_read_lock;
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -326,6 +330,7 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
*/
args->doorbell_offset |= doorbell_offset_in_process;
+ amdgpu_read_unlock(dev->ddev);
mutex_unlock(&p->mutex);
pr_debug("Queue id %d was created successfully\n", args->queue_id);
@@ -343,6 +348,8 @@ static int kfd_ioctl_create_queue(struct file *filep, struct kfd_process *p,
err_create_queue:
err_bind_process:
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
mutex_unlock(&p->mutex);
return err;
}
@@ -352,6 +359,7 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
{
int retval;
struct kfd_ioctl_destroy_queue_args *args = data;
+ struct kfd_dev *dev;
pr_debug("Destroying queue id %d for pasid 0x%x\n",
args->queue_id,
@@ -359,8 +367,20 @@ static int kfd_ioctl_destroy_queue(struct file *filp, struct kfd_process *p,
mutex_lock(&p->mutex);
+ dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+ if (!dev) {
+ retval = -EINVAL;
+ goto err_query_dev;
+ }
+
+ retval = amdgpu_read_lock(dev->ddev, true);
+ if (retval)
+ goto err_read_lock;
retval = pqm_destroy_queue(&p->pqm, args->queue_id);
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
mutex_unlock(&p->mutex);
return retval;
}
@@ -371,6 +391,7 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
int retval;
struct kfd_ioctl_update_queue_args *args = data;
struct queue_properties properties;
+ struct kfd_dev *dev;
if (args->queue_percentage > KFD_MAX_QUEUE_PERCENTAGE) {
pr_err("Queue percentage must be between 0 to KFD_MAX_QUEUE_PERCENTAGE\n");
@@ -404,10 +425,21 @@ static int kfd_ioctl_update_queue(struct file *filp, struct kfd_process *p,
mutex_lock(&p->mutex);
+ dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+ if (!dev) {
+ retval = -EINVAL;
+ goto err_query_dev;
+ }
+
+ retval = amdgpu_read_lock(dev->ddev, true);
+ if (retval)
+ goto err_read_lock;
retval = pqm_update_queue(&p->pqm, args->queue_id, &properties);
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
mutex_unlock(&p->mutex);
-
return retval;
}
@@ -420,6 +452,7 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
struct queue_properties properties;
uint32_t __user *cu_mask_ptr = (uint32_t __user *)args->cu_mask_ptr;
size_t cu_mask_size = sizeof(uint32_t) * (args->num_cu_mask / 32);
+ struct kfd_dev *dev;
if ((args->num_cu_mask % 32) != 0) {
pr_debug("num_cu_mask 0x%x must be a multiple of 32",
@@ -456,8 +489,20 @@ static int kfd_ioctl_set_cu_mask(struct file *filp, struct kfd_process *p,
mutex_lock(&p->mutex);
+ dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+ if (!dev) {
+ retval = -EINVAL;
+ goto err_query_dev;
+ }
+
+ retval = amdgpu_read_lock(dev->ddev, true);
+ if (retval)
+ goto err_read_lock;
retval = pqm_set_cu_mask(&p->pqm, args->queue_id, &properties);
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
mutex_unlock(&p->mutex);
if (retval)
@@ -471,14 +516,27 @@ static int kfd_ioctl_get_queue_wave_state(struct file *filep,
{
struct kfd_ioctl_get_queue_wave_state_args *args = data;
int r;
+ struct kfd_dev *dev;
mutex_lock(&p->mutex);
+ dev = pqm_query_dev_by_qid(&p->pqm, args->queue_id);
+ if (!dev) {
+ r = -EINVAL;
+ goto err_query_dev;
+ }
+
+ r = amdgpu_read_lock(dev->ddev, true);
+ if (r)
+ goto err_read_lock;
r = pqm_get_wave_state(&p->pqm, args->queue_id,
(void __user *)args->ctl_stack_address,
&args->ctl_stack_used_size,
&args->save_area_used_size);
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
+err_query_dev:
mutex_unlock(&p->mutex);
return r;
@@ -509,6 +567,10 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
mutex_lock(&p->mutex);
+ err = amdgpu_read_lock(dev->ddev, true);
+ if (err)
+ goto err_read_lock;
+
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
err = -ESRCH;
@@ -531,6 +593,9 @@ static int kfd_ioctl_set_memory_policy(struct file *filep,
err = -EINVAL;
out:
+ amdgpu_read_unlock(dev->ddev);
+
+err_read_lock:
mutex_unlock(&p->mutex);
return err;
@@ -550,6 +615,10 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
mutex_lock(&p->mutex);
+ err = amdgpu_read_lock(dev->ddev, true);
+ if (err)
+ goto err_read_lock;
+
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
err = -ESRCH;
@@ -559,6 +628,9 @@ static int kfd_ioctl_set_trap_handler(struct file *filep,
kfd_process_set_trap_handler(&pdd->qpd, args->tba_addr, args->tma_addr);
out:
+ amdgpu_read_unlock(dev->ddev);
+
+err_read_lock:
mutex_unlock(&p->mutex);
return err;
@@ -584,6 +656,11 @@ static int kfd_ioctl_dbg_register(struct file *filep,
}
mutex_lock(&p->mutex);
+
+ status = amdgpu_read_lock(dev->ddev, true);
+ if (status)
+ goto err_read_lock;
+
mutex_lock(kfd_get_dbgmgr_mutex());
/*
@@ -613,6 +690,9 @@ static int kfd_ioctl_dbg_register(struct file *filep,
out:
mutex_unlock(kfd_get_dbgmgr_mutex());
+ amdgpu_read_unlock(dev->ddev);
+
+err_read_lock:
mutex_unlock(&p->mutex);
return status;
@@ -634,6 +714,10 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
return -EINVAL;
}
+ status = amdgpu_read_lock(dev->ddev, true);
+ if (status)
+ return status;
+
mutex_lock(kfd_get_dbgmgr_mutex());
status = kfd_dbgmgr_unregister(dev->dbgmgr, p);
@@ -644,6 +728,8 @@ static int kfd_ioctl_dbg_unregister(struct file *filep,
mutex_unlock(kfd_get_dbgmgr_mutex());
+ amdgpu_read_unlock(dev->ddev);
+
return status;
}
@@ -743,15 +829,19 @@ static int kfd_ioctl_dbg_address_watch(struct file *filep,
/* Currently HSA Event is not supported for DBG */
aw_info.watch_event = NULL;
+ status = amdgpu_read_lock(dev->ddev, true);
+ if (status)
+ goto out;
+
mutex_lock(kfd_get_dbgmgr_mutex());
status = kfd_dbgmgr_address_watch(dev->dbgmgr, &aw_info);
mutex_unlock(kfd_get_dbgmgr_mutex());
+ amdgpu_read_unlock(dev->ddev);
out:
kfree(args_buff);
-
return status;
}
@@ -822,6 +912,10 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
*((uint32_t *)(&args_buff[args_idx]));
wac_info.dbgWave_msg.MemoryVA = NULL;
+ status = amdgpu_read_lock(dev->ddev, true);
+ if (status)
+ goto pro_end;
+
mutex_lock(kfd_get_dbgmgr_mutex());
pr_debug("Calling dbg manager process %p, operand %u, mode %u, trapId %u, message %u\n",
@@ -835,6 +929,9 @@ static int kfd_ioctl_dbg_wave_control(struct file *filep,
mutex_unlock(kfd_get_dbgmgr_mutex());
+ amdgpu_read_unlock(dev->ddev);
+
+pro_end:
kfree(args_buff);
return status;
@@ -847,10 +944,11 @@ static int kfd_ioctl_get_clock_counters(struct file *filep,
struct kfd_dev *dev;
dev = kfd_device_by_id(args->gpu_id);
- if (dev)
+ if (dev && !amdgpu_read_lock(dev->ddev, true)) {
/* Reading GPU clock counter from KGD */
args->gpu_clock_counter = amdgpu_amdkfd_get_gpu_clock_counter(dev->kgd);
- else
+ amdgpu_read_unlock(dev->ddev);
+ } else
/* Node without GPU resource */
args->gpu_clock_counter = 0;
@@ -1056,13 +1154,20 @@ static int kfd_ioctl_create_event(struct file *filp, struct kfd_process *p,
}
mutex_unlock(&p->mutex);
+ err = amdgpu_read_lock(kfd->ddev, true);
+ if (err)
+ return err;
+
err = amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(kfd->kgd,
mem, &kern_addr, &size);
if (err) {
pr_err("Failed to map event page to kernel\n");
+ amdgpu_read_unlock(kfd->ddev);
return err;
}
+ amdgpu_read_unlock(kfd->ddev);
+
err = kfd_event_page_set(p, kern_addr, size);
if (err) {
pr_err("Failed to set event page\n");
@@ -1144,11 +1249,17 @@ static int kfd_ioctl_set_scratch_backing_va(struct file *filep,
mutex_unlock(&p->mutex);
+ err = amdgpu_read_lock(dev->ddev, true);
+ if (err)
+ return err;
+
if (dev->dqm->sched_policy == KFD_SCHED_POLICY_NO_HWS &&
pdd->qpd.vmid != 0 && dev->kfd2kgd->set_scratch_backing_va)
dev->kfd2kgd->set_scratch_backing_va(
dev->kgd, args->va_addr, pdd->qpd.vmid);
+ amdgpu_read_unlock(dev->ddev);
+
return 0;
bind_process_to_device_fail:
@@ -1217,6 +1328,10 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
mutex_lock(&p->mutex);
+ ret = amdgpu_read_lock(dev->ddev, true);
+ if (ret)
+ goto err_read_lock;
+
pdd = kfd_get_process_device_data(dev, p);
if (!pdd) {
ret = -EINVAL;
@@ -1231,12 +1346,16 @@ static int kfd_ioctl_acquire_vm(struct file *filep, struct kfd_process *p,
ret = kfd_process_device_init_vm(pdd, drm_file);
if (ret)
goto err_unlock;
+
+ amdgpu_read_unlock(dev->ddev);
/* On success, the PDD keeps the drm_file reference */
mutex_unlock(&p->mutex);
return 0;
err_unlock:
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
mutex_unlock(&p->mutex);
fput(drm_file);
return ret;
@@ -1289,6 +1408,10 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
mutex_lock(&p->mutex);
+ err = amdgpu_read_lock(dev->ddev, true);
+ if (err)
+ goto err_read_lock;
+
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
err = PTR_ERR(pdd);
@@ -1331,6 +1454,7 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
if (flags & KFD_IOC_ALLOC_MEM_FLAGS_VRAM)
WRITE_ONCE(pdd->vram_usage, pdd->vram_usage + args->size);
+ amdgpu_read_unlock(dev->ddev);
mutex_unlock(&p->mutex);
args->handle = MAKE_HANDLE(args->gpu_id, idr_handle);
@@ -1348,6 +1472,8 @@ static int kfd_ioctl_alloc_memory_of_gpu(struct file *filep,
err_free:
amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem, NULL);
err_unlock:
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
mutex_unlock(&p->mutex);
return err;
}
@@ -1368,6 +1494,10 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
mutex_lock(&p->mutex);
+ ret = amdgpu_read_lock(dev->ddev, true);
+ if (ret)
+ goto err_read_lock;
+
pdd = kfd_get_process_device_data(dev, p);
if (!pdd) {
pr_err("Process device data doesn't exist\n");
@@ -1395,6 +1525,8 @@ static int kfd_ioctl_free_memory_of_gpu(struct file *filep,
WRITE_ONCE(pdd->vram_usage, pdd->vram_usage - size);
err_unlock:
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
mutex_unlock(&p->mutex);
return ret;
}
@@ -1465,13 +1597,21 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
err = PTR_ERR(peer_pdd);
goto get_mem_obj_from_handle_failed;
}
+
+ err = amdgpu_read_lock(peer->ddev, true);
+ if (err)
+ goto map_memory_to_gpu_failed;
+
err = amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
if (err) {
pr_err("Failed to map to gpu %d/%d\n",
i, args->n_devices);
+ amdgpu_read_unlock(peer->ddev);
goto map_memory_to_gpu_failed;
}
+
+ amdgpu_read_unlock(peer->ddev);
args->n_success = i+1;
}
@@ -1491,7 +1631,10 @@ static int kfd_ioctl_map_memory_to_gpu(struct file *filep,
peer_pdd = kfd_get_process_device_data(peer, p);
if (WARN_ON_ONCE(!peer_pdd))
continue;
- kfd_flush_tlb(peer_pdd);
+ if (!amdgpu_read_lock(peer->ddev, true)) {
+ kfd_flush_tlb(peer_pdd);
+ amdgpu_read_unlock(peer->ddev);
+ }
}
kfree(devices_arr);
@@ -1572,13 +1715,20 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
err = -ENODEV;
goto get_mem_obj_from_handle_failed;
}
+
+ err = amdgpu_read_lock(peer->ddev, true);
+ if (err)
+ goto unmap_memory_from_gpu_failed;
+
err = amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
peer->kgd, (struct kgd_mem *)mem, peer_pdd->vm);
if (err) {
pr_err("Failed to unmap from gpu %d/%d\n",
i, args->n_devices);
+ amdgpu_read_unlock(peer->ddev);
goto unmap_memory_from_gpu_failed;
}
+ amdgpu_read_unlock(peer->ddev);
args->n_success = i+1;
}
kfree(devices_arr);
@@ -1624,7 +1774,13 @@ static int kfd_ioctl_alloc_queue_gws(struct file *filep,
goto out_unlock;
}
+ retval = amdgpu_read_lock(dev->ddev, true);
+ if (retval)
+ goto out_unlock;
+
retval = pqm_set_gws(&p->pqm, args->queue_id, args->num_gws ? dev->gws : NULL);
+
+ amdgpu_read_unlock(dev->ddev);
mutex_unlock(&p->mutex);
args->first_gws = 0;
@@ -1711,6 +1867,9 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
return PTR_ERR(dmabuf);
mutex_lock(&p->mutex);
+ r = amdgpu_read_lock(dev->ddev, true);
+ if (r)
+ goto err_read_lock;
pdd = kfd_bind_process_to_device(dev, p);
if (IS_ERR(pdd)) {
@@ -1731,6 +1890,7 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
goto err_free;
}
+ amdgpu_read_unlock(dev->ddev);
mutex_unlock(&p->mutex);
dma_buf_put(dmabuf);
@@ -1741,6 +1901,8 @@ static int kfd_ioctl_import_dmabuf(struct file *filep,
err_free:
amdgpu_amdkfd_gpuvm_free_memory_of_gpu(dev->kgd, (struct kgd_mem *)mem, NULL);
err_unlock:
+ amdgpu_read_unlock(dev->ddev);
+err_read_lock:
mutex_unlock(&p->mutex);
dma_buf_put(dmabuf);
return r;
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
index d8c8b5ff449a..5ea25c7dff0d 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_priv.h
@@ -1011,7 +1011,8 @@ int pqm_get_wave_state(struct process_queue_manager *pqm,
void __user *ctl_stack,
u32 *ctl_stack_used_size,
u32 *save_area_used_size);
-
+struct kfd_dev *pqm_query_dev_by_qid(struct process_queue_manager *pqm,
+ unsigned int qid);
int amdkfd_fence_wait_timeout(unsigned int *fence_addr,
unsigned int fence_value,
unsigned int timeout_ms);
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process.c b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
index f5237997fa18..d02ca231ad83 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process.c
@@ -898,11 +898,15 @@ static void kfd_process_device_free_bos(struct kfd_process_device *pdd)
per_device_list) {
if (!peer_pdd->vm)
continue;
+ amdgpu_read_lock(peer_pdd->dev->ddev, false);
amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
peer_pdd->dev->kgd, mem, peer_pdd->vm);
+ amdgpu_read_unlock(peer_pdd->dev->ddev);
}
+ amdgpu_read_lock(pdd->dev->ddev, false);
amdgpu_amdkfd_gpuvm_free_memory_of_gpu(pdd->dev->kgd, mem, NULL);
+ amdgpu_read_unlock(pdd->dev->ddev);
kfd_process_device_remove_obj_handle(pdd, id);
}
}
diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
index eb1635ac8988..2b2308c0b006 100644
--- a/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
+++ b/drivers/gpu/drm/amd/amdkfd/kfd_process_queue_manager.c
@@ -64,6 +64,23 @@ static int find_available_queue_slot(struct process_queue_manager *pqm,
return 0;
}
+struct kfd_dev *pqm_query_dev_by_qid(struct process_queue_manager *pqm,
+ unsigned int qid)
+{
+ struct process_queue_node *pqn;
+
+ pqn = get_queue_by_qid(pqm, qid); /* look up the node for this queue id */
+ if (!pqn) {
+ pr_err("Queue id does not match any known queue\n");
+ return NULL;
+ }
+ /* Return the device of the node's queue; a node with q unset yields NULL. */
+ if (pqn->q)
+ return pqn->q->device;
+ /* NOTE(review): ioctl callers turn NULL into -EINVAL - confirm for nodes without q. */
+ return NULL;
+}
+
void kfd_process_dequeue_from_device(struct kfd_process_device *pdd)
{
struct kfd_dev *dev = pdd->dev;
--
2.17.1
More information about the amd-gfx
mailing list