[PATCH 5/7] drm/amdgpu/userq: implement resets
Alex Deucher
alexander.deucher at amd.com
Wed Apr 30 15:48:07 UTC 2025
If a queue map or unmap operation fails, or a user fence times out, attempt
to reset the queue. If the queue reset also fails, fall back to scheduling
a full GPU reset.
Reviewed-by: Sunil Khatri <sunil.khatri at amd.com>
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
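Note for reviewers: the escalation order here is per-queue reset first, full
GPU reset only as a fallback, and the full reset is deferred to a work item
rather than run inline, so it executes outside the callers' lock context. As
a refresher on the generic work_struct hand-off this builds on, here is a
minimal self-contained sketch (the names are made up for illustration; the
patch itself schedules through amdgpu_reset_domain_schedule() instead of
schedule_work() so that resets within a domain are serialized on its ordered
workqueue):

#include <linux/workqueue.h>
#include <linux/container_of.h>

struct my_dev {
	struct work_struct reset_work;
};

/* Runs later in process context, outside the requester's locks. */
static void my_reset_fn(struct work_struct *work)
{
	struct my_dev *mydev = container_of(work, struct my_dev, reset_work);

	/* Heavyweight recovery using mydev would go here. */
	(void)mydev;
}

static void my_dev_init(struct my_dev *mydev)
{
	INIT_WORK(&mydev->reset_work, my_reset_fn);
}

static void my_dev_request_reset(struct my_dev *mydev)
{
	/*
	 * Scheduling an already-pending work item is a no-op, so
	 * concurrent failure paths naturally coalesce into one reset.
	 */
	schedule_work(&mydev->reset_work);
}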
drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c | 128 +++++++++++++++++++--
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 5 +
4 files changed, 131 insertions(+), 11 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
index cc26cf1bd843e..936e1a0ac02f1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h
@@ -1251,6 +1251,7 @@ struct amdgpu_device {
struct list_head userq_mgr_list;
struct mutex userq_mutex;
bool userq_halt_for_enforce_isolation;
+ struct work_struct userq_reset_work;
};
static inline uint32_t amdgpu_ip_version(const struct amdgpu_device *adev,
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
index fe68ba9997ae4..317c86c1493a8 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
@@ -4420,6 +4420,7 @@ int amdgpu_device_init(struct amdgpu_device *adev,
}
INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
+ INIT_WORK(&adev->userq_reset_work, amdgpu_userq_reset_work);
adev->gfx.gfx_off_req_count = 1;
adev->gfx.gfx_off_residency = 0;
@@ -5777,6 +5778,10 @@ int amdgpu_device_reinit_after_reset(struct amdgpu_reset_context *reset_context)
if (r)
goto out;
+ r = amdgpu_userq_post_reset(tmp_adev, vram_lost);
+ if (r)
+ goto out;
+
drm_client_dev_resume(adev_to_drm(tmp_adev), false);
/*
@@ -5999,6 +6004,7 @@ static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
if (!amdgpu_sriov_vf(adev))
cancel_work(&adev->reset_work);
#endif
+ cancel_work(&adev->userq_reset_work);
if (adev->kfd.dev)
cancel_work(&adev->kfd.reset_work);
@@ -6109,6 +6115,8 @@ static int amdgpu_device_halt_activities(struct amdgpu_device *adev,
amdgpu_device_ip_need_full_reset(tmp_adev))
amdgpu_ras_suspend(tmp_adev);
+ amdgpu_userq_pre_reset(tmp_adev);
+
for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
struct amdgpu_ring *ring = tmp_adev->rings[i];
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
index afbe01149ed3f..4be46fa76ceba 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.c
@@ -26,7 +26,10 @@
#include <drm/drm_exec.h>
#include <linux/pm_runtime.h>
+#include <drm/drm_drv.h>
+
#include "amdgpu.h"
+#include "amdgpu_reset.h"
#include "amdgpu_vm.h"
#include "amdgpu_userq.h"
#include "amdgpu_userq_fence.h"
@@ -44,6 +47,44 @@ u32 amdgpu_userq_get_supported_ip_mask(struct amdgpu_device *adev)
return userq_ip_mask;
}
+static void amdgpu_userq_gpu_reset(struct amdgpu_device *adev)
+{
+ if (amdgpu_device_should_recover_gpu(adev))
+ amdgpu_reset_domain_schedule(adev->reset_domain,
+ &adev->userq_reset_work);
+}
+
+static bool
+amdgpu_userq_queue_reset_helper(struct amdgpu_userq_mgr *uq_mgr,
+ struct amdgpu_usermode_queue *queue)
+{
+ struct amdgpu_device *adev = uq_mgr->adev;
+ const struct amdgpu_userq_funcs *userq_funcs =
+ adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
+ int r;
+
+ if (unlikely(adev->debug_disable_gpu_ring_reset)) {
+ dev_err(adev->dev, "userq reset disabled by debug mask\n");
+ } else if (amdgpu_gpu_recovery && userq_funcs->reset) {
+ r = userq_funcs->reset(uq_mgr, queue);
+ if (r) {
+ dev_err(adev->dev, "userq reset failed\n");
+ gpu_reset = true;
+ } else {
+ dev_err(adev->dev, "userq reset succeeded\n");
+ atomic_inc(&adev->gpu_reset_counter);
+ amdgpu_userq_fence_driver_force_completion(queue);
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+ }
+ } else if (amdgpu_gpu_recovery && !userq_funcs->reset) {
+ gpu_reset = true;
+ }
+
+ return gpu_reset;
+}
+
static int
amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue)
@@ -51,15 +92,22 @@ amdgpu_userq_unmap_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
r = userq_funcs->unmap(uq_mgr, queue);
- if (r)
+ if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
- else
+ gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr, queue);
+ } else {
queue->state = AMDGPU_USERQ_STATE_UNMAPPED;
+ }
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -70,16 +118,22 @@ amdgpu_userq_map_helper(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_device *adev = uq_mgr->adev;
const struct amdgpu_userq_funcs *userq_funcs =
adev->userq_funcs[queue->queue_type];
+ bool gpu_reset = false;
int r = 0;
if (queue->state == AMDGPU_USERQ_STATE_UNMAPPED) {
r = userq_funcs->map(uq_mgr, queue);
if (r) {
queue->state = AMDGPU_USERQ_STATE_HUNG;
+ gpu_reset = amdgpu_userq_queue_reset_helper(uq_mgr, queue);
} else {
queue->state = AMDGPU_USERQ_STATE_MAPPED;
}
}
+
+ if (gpu_reset)
+ amdgpu_userq_gpu_reset(adev);
+
return r;
}
@@ -698,6 +752,23 @@ amdgpu_userq_evict_all(struct amdgpu_userq_mgr *uq_mgr)
return ret;
}
+void amdgpu_userq_reset_work(struct work_struct *work)
+{
+ struct amdgpu_device *adev = container_of(work, struct amdgpu_device,
+ userq_reset_work);
+ struct amdgpu_reset_context reset_context;
+
+ memset(&reset_context, 0, sizeof(reset_context));
+
+ reset_context.method = AMD_RESET_METHOD_NONE;
+ reset_context.reset_req_dev = adev;
+ reset_context.src = AMDGPU_RESET_SRC_USERQ;
+ set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
+ /*set_bit(AMDGPU_SKIP_COREDUMP, &reset_context.flags);*/
+
+ amdgpu_device_gpu_recover(adev, NULL, &reset_context);
+}
+
static int
amdgpu_userq_wait_for_signal(struct amdgpu_userq_mgr *uq_mgr)
{
@@ -724,22 +795,19 @@ void
amdgpu_userq_evict(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_eviction_fence *ev_fence)
{
- int ret;
struct amdgpu_fpriv *fpriv = uq_mgr_to_fpriv(uq_mgr);
struct amdgpu_eviction_fence_mgr *evf_mgr = &fpriv->evf_mgr;
+ struct amdgpu_device *adev = uq_mgr->adev;
+ int ret;
/* Wait for any pending userqueue fence work to finish */
ret = amdgpu_userq_wait_for_signal(uq_mgr);
- if (ret) {
- DRM_ERROR("Not evicting userqueue, timeout waiting for work\n");
- return;
- }
+ if (ret)
+		dev_err(adev->dev, "Timeout waiting for work, evicting userqueue anyway\n");
ret = amdgpu_userq_evict_all(uq_mgr);
- if (ret) {
- DRM_ERROR("Failed to evict userqueue\n");
- return;
- }
+ if (ret)
+ dev_err(adev->dev, "Failed to evict userqueue\n");
/* Signal current eviction fence */
amdgpu_eviction_fence_signal(evf_mgr, ev_fence);
@@ -914,3 +982,41 @@ int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
mutex_unlock(&adev->userq_mutex);
return ret;
}
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev)
+{
+ const struct amdgpu_userq_funcs *userq_funcs;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm, *tmp;
+ int queue_id;
+
+ mutex_lock(&adev->userq_mutex);
+ list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+ cancel_delayed_work_sync(&uqm->resume_work);
+ mutex_lock(&uqm->userq_mutex);
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->state == AMDGPU_USERQ_STATE_MAPPED) {
+ amdgpu_userq_wait_for_last_fence(uqm, queue);
+ userq_funcs = adev->userq_funcs[queue->queue_type];
+ userq_funcs->unmap(uqm, queue);
+ /* just mark all queues as hung at this point.
+ * if unmap succeeds, we could map again
+ * in amdgpu_userq_post_reset() if vram is not lost
+ */
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ amdgpu_userq_fence_driver_force_completion(queue);
+ }
+ }
+ mutex_unlock(&uqm->userq_mutex);
+ }
+ mutex_unlock(&adev->userq_mutex);
+}
+
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
+{
+ /* if any queue state is AMDGPU_USERQ_STATE_UNMAPPED
+ * at this point, we should be able to map it again
+ * and continue if vram is not lost.
+ */
+ return 0;
+}
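(Aside, not part of the diff: amdgpu_userq_post_reset() is deliberately a
stub at this point. If it later grows the remap logic its comment describes,
one plausible, untested sketch mirrors the iteration already used in
amdgpu_userq_pre_reset() above, remapping only queues that the pre-reset path
marked hung, and only when VRAM survived:

int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost)
{
	const struct amdgpu_userq_funcs *userq_funcs;
	struct amdgpu_usermode_queue *queue;
	struct amdgpu_userq_mgr *uqm, *tmp;
	int queue_id, r = 0;

	mutex_lock(&adev->userq_mutex);
	list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
		mutex_lock(&uqm->userq_mutex);
		idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
			if (queue->state != AMDGPU_USERQ_STATE_HUNG || vram_lost)
				continue;
			userq_funcs = adev->userq_funcs[queue->queue_type];
			r = userq_funcs->map(uqm, queue);
			if (r) {
				dev_err(adev->dev, "Failed to remap queue %d\n",
					queue_id);
				continue;
			}
			queue->state = AMDGPU_USERQ_STATE_MAPPED;
		}
		mutex_unlock(&uqm->userq_mutex);
	}
	mutex_unlock(&adev->userq_mutex);

	return r;
}

Queues whose VRAM contents were lost would stay hung and be left for
userspace to tear down and recreate.)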
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index 24d201cdc9887..6ede08dd821d2 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -132,4 +132,9 @@ int amdgpu_userq_stop_sched_for_enforce_isolation(struct amdgpu_device *adev,
int amdgpu_userq_start_sched_for_enforce_isolation(struct amdgpu_device *adev,
u32 idx);
+void amdgpu_userq_reset_work(struct work_struct *work);
+
+void amdgpu_userq_pre_reset(struct amdgpu_device *adev);
+int amdgpu_userq_post_reset(struct amdgpu_device *adev, bool vram_lost);
+
#endif
--
2.49.0