[v3 8/9] drm/amdgpu/userq: add a detect and reset callback
Jesse.Zhang
Jesse.Zhang at amd.com
Tue Jun 17 09:20:20 UTC 2025
From: Alex Deucher <alexander.deucher at amd.com>
Add a detect and reset callback and add the implementation
for mes. The callback will detect all hung queues of a
particular ip type (e.g., GFX or compute or SDMA) and
reset them.
v2: increase reset counter and set fence force completion (Jesse)
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 3 ++
drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 51 ++++++++++++++++++++++
2 files changed, 54 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
index ec040c2fd6c9..0335ff03f65f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
@@ -77,6 +77,9 @@ struct amdgpu_userq_funcs {
struct amdgpu_usermode_queue *queue);
int (*map)(struct amdgpu_userq_mgr *uq_mgr,
struct amdgpu_usermode_queue *queue);
+ int (*detect_and_reset)(struct amdgpu_device *adev,
+ int queue_type);
+
};
/* Usermode queues for gfx */
diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
index d6f50b13e2ba..52d438b5dcef 100644
--- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
+++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
@@ -21,6 +21,7 @@
* OTHER DEALINGS IN THE SOFTWARE.
*
*/
+#include <drm/drm_drv.h>
#include "amdgpu.h"
#include "amdgpu_gfx.h"
#include "mes_userqueue.h"
@@ -198,6 +199,55 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
return 0;
}
+static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
+ int queue_type)
+{
+ int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
+ struct mes_detect_and_reset_queue_input input;
+ struct amdgpu_usermode_queue *queue;
+ struct amdgpu_userq_mgr *uqm, *tmp;
+ unsigned int hung_db_num = 0;
+ int queue_id, r, i;
+ u32 db_array[4];
+
+ if (db_array_size > 4) {
+ dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
+ db_array_size);
+ return -EINVAL;
+ }
+
+ memset(&input, 0x0, sizeof(struct mes_detect_and_reset_queue_input));
+
+ input.queue_type = queue_type;
+
+ amdgpu_mes_lock(&adev->mes);
+ r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
+ &hung_db_num, db_array);
+ amdgpu_mes_unlock(&adev->mes);
+ if (r) {
+ dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
+ } else if (hung_db_num) {
+ mutex_lock(&adev->userq_mutex);
+ list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
+ idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
+ if (queue->queue_type == queue_type) {
+ for (i = 0; i < hung_db_num; i++) {
+ if (queue->doorbell_index == db_array[i]) {
+ queue->state = AMDGPU_USERQ_STATE_HUNG;
+ atomic_inc(&adev->gpu_reset_counter);
+ amdgpu_userq_fence_driver_force_completion(queue);
+ drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
+ }
+ }
+ }
+ }
+ }
+ mutex_unlock(&adev->userq_mutex);
+ }
+
+ return r;
+}
+
static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
struct drm_amdgpu_userq_in *args_in,
struct amdgpu_usermode_queue *queue)
@@ -352,4 +402,5 @@ const struct amdgpu_userq_funcs userq_mes_funcs = {
.mqd_destroy = mes_userq_mqd_destroy,
.unmap = mes_userq_unmap,
.map = mes_userq_map,
+ .detect_and_reset = mes_userq_detect_and_reset,
};
--
2.49.0
More information about the amd-gfx
mailing list