[PATCH 2/2] drm/amdgpu/sdma: guilty tracking is per instance

Lazar, Lijo lijo.lazar at amd.com
Mon Mar 17 14:34:07 UTC 2025



On 3/17/2025 7:59 PM, Deucher, Alexander wrote:
> [Public]
> 
>> -----Original Message-----
>> From: Lazar, Lijo <Lijo.Lazar at amd.com>
>> Sent: Monday, March 17, 2025 10:19 AM
>> To: Deucher, Alexander <Alexander.Deucher at amd.com>; amd-
>> gfx at lists.freedesktop.org
>> Subject: Re: [PATCH 2/2] drm/amdgpu/sdma: guilty tracking is per instance
>>
>>
>>
>> On 3/17/2025 7:17 PM, Alex Deucher wrote:
>>> The gfx and page queues are per instance, so track them per instance.
>>>
>>> Signed-off-by: Alex Deucher <alexander.deucher at amd.com>
>>> ---
>>>  drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h |  7 ++++---
>>> drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 18 +++++++++++-------
>>>  2 files changed, 15 insertions(+), 10 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
>>> index 8e7e794ff136f..dc1a81c2f9af7 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_sdma.h
>>> @@ -65,6 +65,10 @@ struct amdgpu_sdma_instance {
>>>     uint64_t                sdma_fw_gpu_addr;
>>>     uint32_t                *sdma_fw_ptr;
>>>     struct mutex            engine_reset_mutex;
>>> +   /* track guilty state of GFX and PAGE queues */
>>> +   bool                    gfx_guilty;
>>> +   bool                    page_guilty;
>>> +
>>>  };
>>>
>>>  enum amdgpu_sdma_ras_memory_id {
>>> @@ -127,9 +131,6 @@ struct amdgpu_sdma {
>>>     uint32_t                *ip_dump;
>>>     uint32_t                supported_reset;
>>>     struct list_head        reset_callback_list;
>>> -   /* track guilty state of GFX and PAGE queues */
>>> -   bool gfx_guilty;
>>> -   bool page_guilty;
>>>  };
>>>
>>>  /*
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>> b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>> index 927db7a080f0a..c485b582a20f6 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
>>> @@ -989,9 +989,10 @@ static int sdma_v4_4_2_inst_start(struct
>> amdgpu_device *adev,
>>>             uint32_t temp;
>>>
>>>             WREG32_SDMA(i, regSDMA_SEM_WAIT_FAIL_TIMER_CNTL, 0);
>>> -           sdma_v4_4_2_gfx_resume(adev, i, restore, adev->sdma.gfx_guilty);
>>> +           sdma_v4_4_2_gfx_resume(adev, i, restore,
>>> +adev->sdma.instance[i].gfx_guilty);
>>>             if (adev->sdma.has_page_queue)
>>> -                   sdma_v4_4_2_page_resume(adev, i, restore, adev-
>>> sdma.page_guilty);
>>> +                   sdma_v4_4_2_page_resume(adev, i, restore,
>>> +                                           adev->sdma.instance[i].page_guilty);
>>
>> I think passing the flag is no longer be required as the instance id is passed already.
> 
> We still need to determine which queue needs to be reset and restored vs.just reset.
> 

What I meant is it can be checked from within the function -
adev->sdma.instance[i].page_guilty. Second param identifies the instance.

Thanks,
Lijo

> Alex
> 
> 
>>
>> Thanks,
>> Lijo
>>
>>>
>>>             /* set utc l1 enable flag always to 1 */
>>>             temp = RREG32_SDMA(i, regSDMA_CNTL); @@ -1446,6
>> +1447,10 @@ static
>>> int sdma_v4_4_2_sw_init(struct amdgpu_ip_block *ip_block)
>>>
>>>     for (i = 0; i < adev->sdma.num_instances; i++) {
>>>             mutex_init(&adev->sdma.instance[i].engine_reset_mutex);
>>> +           /* Initialize guilty flags for GFX and PAGE queues */
>>> +           adev->sdma.instance[i].gfx_guilty = false;
>>> +           adev->sdma.instance[i].page_guilty = false;
>>> +
>>>             ring = &adev->sdma.instance[i].ring;
>>>             ring->ring_obj = NULL;
>>>             ring->use_doorbell = true;
>>> @@ -1507,9 +1512,6 @@ static int sdma_v4_4_2_sw_init(struct
>> amdgpu_ip_block *ip_block)
>>>     r = amdgpu_sdma_sysfs_reset_mask_init(adev);
>>>     if (r)
>>>             return r;
>>> -   /* Initialize guilty flags for GFX and PAGE queues */
>>> -   adev->sdma.gfx_guilty = false;
>>> -   adev->sdma.page_guilty = false;
>>>
>>>     return r;
>>>  }
>>> @@ -1686,9 +1688,11 @@ static int sdma_v4_4_2_stop_queue(struct
>> amdgpu_device *adev, uint32_t instance_
>>>             return -EINVAL;
>>>
>>>     /* Check if this queue is the guilty one */
>>> -   adev->sdma.gfx_guilty = sdma_v4_4_2_is_queue_selected(adev,
>> instance_id, false);
>>> +   adev->sdma.instance[instance_id].gfx_guilty =
>>> +           sdma_v4_4_2_is_queue_selected(adev, instance_id, false);
>>>     if (adev->sdma.has_page_queue)
>>> -           adev->sdma.page_guilty = sdma_v4_4_2_is_queue_selected(adev,
>> instance_id, true);
>>> +           adev->sdma.instance[instance_id].page_guilty =
>>> +                   sdma_v4_4_2_is_queue_selected(adev, instance_id, true);
>>>
>>>     /* Cache the rptr before reset, after the reset,
>>>     * all of the registers will be reset to 0
> 



More information about the amd-gfx mailing list