[v3 2/9] drm/amdgpu/mes: add front end for detect and reset hung queue
Jesse.Zhang
Jesse.Zhang at amd.com
Tue Jun 17 09:20:14 UTC 2025
From: Alex Deucher <alexander.deucher at amd.com>
Helper function to detect and reset hung queues. MES will
return an array of doorbell indices of which queues are hung
and were optionally reset.
Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c | 62 +++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h | 19 ++++++++
2 files changed, 81 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
index 6fa9fa11c8f3..c1c7064237fb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.c
@@ -191,6 +191,20 @@ int amdgpu_mes_init(struct amdgpu_device *adev)
if (r)
goto error_doorbell;
+ if (adev->mes.hung_queue_db_array_size) {
+ r = amdgpu_bo_create_kernel(adev,
+ adev->mes.hung_queue_db_array_size * sizeof(u32),
+ PAGE_SIZE,
+ AMDGPU_GEM_DOMAIN_GTT,
+ &adev->mes.hung_queue_db_array_gpu_obj,
+ &adev->mes.hung_queue_db_array_gpu_addr,
+ &adev->mes.hung_queue_db_array_cpu_addr);
+ if (r) {
+ dev_warn(adev->dev, "failed to create MES hung db array buffer (%d)", r);
+ goto error_doorbell;
+ }
+ }
+
return 0;
error_doorbell:
@@ -216,6 +230,10 @@ void amdgpu_mes_fini(struct amdgpu_device *adev)
{
int i;
+ amdgpu_bo_free_kernel(&adev->mes.hung_queue_db_array_gpu_obj,
+ &adev->mes.hung_queue_db_array_gpu_addr,
+ &adev->mes.hung_queue_db_array_cpu_addr);
+
amdgpu_bo_free_kernel(&adev->mes.event_log_gpu_obj,
&adev->mes.event_log_gpu_addr,
&adev->mes.event_log_cpu_addr);
@@ -366,6 +384,50 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
return r;
}
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev)
+{
+ return adev->mes.hung_queue_db_array_size;
+}
+
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+ int queue_type,
+ bool detect_only,
+ unsigned int *hung_db_num,
+ u32 *hung_db_array)
+
+{
+ struct mes_detect_and_reset_queue_input input;
+ u32 *db_array = adev->mes.hung_queue_db_array_cpu_addr;
+ int r, i;
+
+ if (!hung_db_num || !hung_db_array)
+ return -EINVAL;
+
+ if ((queue_type != AMDGPU_RING_TYPE_GFX) &&
+ (queue_type != AMDGPU_RING_TYPE_COMPUTE) &&
+ (queue_type != AMDGPU_RING_TYPE_SDMA))
+ return -EINVAL;
+
+ input.queue_type = queue_type;
+ input.detect_only = detect_only;
+
+ r = adev->mes.funcs->detect_and_reset_hung_queues(&adev->mes,
+ &input);
+ if (r) {
+ dev_err(adev->dev, "failed to detect and reset\n");
+ } else {
+ *hung_db_num = 0;
+ for (i = 0; i < adev->mes.hung_queue_db_array_size; i++) {
+ if (db_array[i] != AMDGPU_MES_INVALID_DB_OFFSET) {
+ hung_db_array[i] = db_array[i];
+ *hung_db_num += 1;
+ }
+ }
+ }
+
+ return r;
+}
+
uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg)
{
struct mes_misc_op_input op_input;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
index c0d2c195fe2e..2c4568951edb 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_mes.h
@@ -41,6 +41,7 @@
#define AMDGPU_MES_API_VERSION_MASK 0x00fff000
#define AMDGPU_MES_FEAT_VERSION_MASK 0xff000000
#define AMDGPU_MES_MSCRATCH_SIZE 0x40000
+#define AMDGPU_MES_INVALID_DB_OFFSET 0xffffffff
enum amdgpu_mes_priority_level {
AMDGPU_MES_PRIORITY_LEVEL_LOW = 0,
@@ -147,6 +148,10 @@ struct amdgpu_mes {
uint64_t resource_1_gpu_addr[AMDGPU_MAX_MES_PIPES];
void *resource_1_addr[AMDGPU_MAX_MES_PIPES];
+ int hung_queue_db_array_size;
+ struct amdgpu_bo *hung_queue_db_array_gpu_obj;
+ uint64_t hung_queue_db_array_gpu_addr;
+ void *hung_queue_db_array_cpu_addr;
};
struct amdgpu_mes_gang {
@@ -280,6 +285,11 @@ struct mes_reset_queue_input {
bool is_kq;
};
+struct mes_detect_and_reset_queue_input {
+ uint32_t queue_type;
+ bool detect_only;
+};
+
enum mes_misc_opcode {
MES_MISC_OP_WRITE_REG,
MES_MISC_OP_READ_REG,
@@ -367,6 +377,8 @@ struct amdgpu_mes_funcs {
int (*reset_hw_queue)(struct amdgpu_mes *mes,
struct mes_reset_queue_input *input);
+ int (*detect_and_reset_hung_queues)(struct amdgpu_mes *mes,
+ struct mes_detect_and_reset_queue_input *input);
};
#define amdgpu_mes_kiq_hw_init(adev) (adev)->mes.kiq_hw_init((adev))
@@ -390,6 +402,13 @@ int amdgpu_mes_reset_legacy_queue(struct amdgpu_device *adev,
unsigned int vmid,
bool use_mmio);
+int amdgpu_mes_get_hung_queue_db_array_size(struct amdgpu_device *adev);
+int amdgpu_mes_detect_and_reset_hung_queues(struct amdgpu_device *adev,
+ int queue_type,
+ bool detect_only,
+ unsigned int *hung_db_num,
+ u32 *hung_db_array);
+
uint32_t amdgpu_mes_rreg(struct amdgpu_device *adev, uint32_t reg);
int amdgpu_mes_wreg(struct amdgpu_device *adev,
uint32_t reg, uint32_t val);
--
2.49.0
More information about the amd-gfx
mailing list