[v3 8/9] drm/amdgpu/userq: add a detect and reset callback
Lazar, Lijo
lijo.lazar at amd.com
Wed Jun 18 04:14:16 UTC 2025
On 6/18/2025 8:14 AM, Zhang, Jesse(Jie) wrote:
> [AMD Official Use Only - AMD Internal Distribution Only]
>
> -----Original Message-----
> From: Lazar, Lijo <Lijo.Lazar at amd.com>
> Sent: Tuesday, June 17, 2025 5:57 PM
> To: Zhang, Jesse(Jie) <Jesse.Zhang at amd.com>; amd-gfx at lists.freedesktop.org
> Cc: Deucher, Alexander <Alexander.Deucher at amd.com>; Koenig, Christian <Christian.Koenig at amd.com>
> Subject: Re: [v3 8/9] drm/amdgpu/userq: add a detect and reset callback
>
>
>
> On 6/17/2025 2:50 PM, Jesse.Zhang wrote:
>> From: Alex Deucher <alexander.deucher at amd.com>
>>
>> Add a detect and reset callback and add the implementation for mes.
>> The callback will detect all hung queues of a particular ip type
>> (e.g., GFX or compute or SDMA) and reset them.
>>
>> v2: increase reset counter and set fence force completion (Jesse)
>>
>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>> ---
>> drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h | 3 ++
>> drivers/gpu/drm/amd/amdgpu/mes_userqueue.c | 51 ++++++++++++++++++++++
>> 2 files changed, 54 insertions(+)
>>
>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> index ec040c2fd6c9..0335ff03f65f 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_userq.h
>> @@ -77,6 +77,9 @@ struct amdgpu_userq_funcs {
>> struct amdgpu_usermode_queue *queue);
>> int (*map)(struct amdgpu_userq_mgr *uq_mgr,
>> struct amdgpu_usermode_queue *queue);
>> + int (*detect_and_reset)(struct amdgpu_device *adev,
>> + int queue_type);
>> +
>> };
>>
>> /* Usermode queues for gfx */
>> diff --git a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> index d6f50b13e2ba..52d438b5dcef 100644
>> --- a/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> +++ b/drivers/gpu/drm/amd/amdgpu/mes_userqueue.c
>> @@ -21,6 +21,7 @@
>> * OTHER DEALINGS IN THE SOFTWARE.
>> *
>> */
>> +#include <drm/drm_drv.h>
>> #include "amdgpu.h"
>> #include "amdgpu_gfx.h"
>> #include "mes_userqueue.h"
>> @@ -198,6 +199,55 @@ static int mes_userq_create_ctx_space(struct amdgpu_userq_mgr *uq_mgr,
>> return 0;
>> }
>>
>> +static int mes_userq_detect_and_reset(struct amdgpu_device *adev,
>> + int queue_type)
>> +{
>> + int db_array_size = amdgpu_mes_get_hung_queue_db_array_size(adev);
>> + struct mes_detect_and_reset_queue_input input;
>> + struct amdgpu_usermode_queue *queue;
>> + struct amdgpu_userq_mgr *uqm, *tmp;
>> + unsigned int hung_db_num = 0;
>> + int queue_id, r, i;
>> + u32 db_array[4];
>> +
>> + if (db_array_size > 4) {
>> + dev_err(adev->dev, "DB array size (%d vs 4) too small\n",
>> + db_array_size);
>> + return -EINVAL;
>> + }
>> +
>> + memset(&input, 0x0, sizeof(struct
>> +mes_detect_and_reset_queue_input));
>> +
>> + input.queue_type = queue_type;
>> +
>> + amdgpu_mes_lock(&adev->mes);
>> + r = amdgpu_mes_detect_and_reset_hung_queues(adev, queue_type, false,
>> + &hung_db_num, db_array);
>> + amdgpu_mes_unlock(&adev->mes);
>> + if (r) {
>> + dev_err(adev->dev, "Failed to detect and reset queues, err (%d)\n", r);
>> + } else if (hung_db_num) {
>> + mutex_lock(&adev->userq_mutex);
>> + list_for_each_entry_safe(uqm, tmp, &adev->userq_mgr_list, list) {
>> + idr_for_each_entry(&uqm->userq_idr, queue, queue_id) {
>> + if (queue->queue_type == queue_type) {
>> + for (i = 0; i < hung_db_num; i++) {
>> + if (queue->doorbell_index == db_array[i]) {
>> + queue->state = AMDGPU_USERQ_STATE_HUNG;
>
> After a reset and force completion of work, why is the queue state maintained as hung? Does that mean no more work can be submitted even after reset? Where is this state checked?
> Detect and reset will be called at umap, and if the queue is hung, umap will be skipped.
Ok, this doesn't match with the inferred meaning of
amdgpu_mes_detect_and_*reset_hung_queues*. The API name gives the
impression that queue is reset and now it can be used to submit other
work. If the queue is not useful after a reset, why not remove it from
the list of active queues?
Thanks,
Lijo
> Thanks
> Jesse
>
> Thanks,
> Lijo
>
>> + atomic_inc(&adev->gpu_reset_counter);
>> + amdgpu_userq_fence_driver_force_completion(queue);
>> + drm_dev_wedged_event(adev_to_drm(adev), DRM_WEDGE_RECOVERY_NONE);
>> + }
>> + }
>> + }
>> + }
>> + }
>> + mutex_unlock(&adev->userq_mutex);
>> + }
>> +
>> + return r;
>> +}
>> +
>> static int mes_userq_mqd_create(struct amdgpu_userq_mgr *uq_mgr,
>> struct drm_amdgpu_userq_in *args_in,
>> struct amdgpu_usermode_queue *queue) @@ -352,4 +402,5 @@ const
>> struct amdgpu_userq_funcs userq_mes_funcs = {
>> .mqd_destroy = mes_userq_mqd_destroy,
>> .unmap = mes_userq_unmap,
>> .map = mes_userq_map,
>> + .detect_and_reset = mes_userq_detect_and_reset,
>> };
>
More information about the amd-gfx
mailing list