[PATCH 2/7] drm/amdgpu: Add ring reset callback for JPEG4_0_3
Sundararaju, Sathishkumar
sasundar at amd.com
Fri Jan 31 16:55:34 UTC 2025
On 1/31/2025 10:19 PM, Christian König wrote:
> Am 31.01.25 um 17:40 schrieb Sundararaju, Sathishkumar:
>> Hi Christian,
>>
>>
>> On 1/31/2025 9:56 PM, Christian König wrote:
>>> Am 31.01.25 um 17:23 schrieb Sathishkumar S:
>>>> Add ring reset function callback for JPEG4_0_3 to
>>>> recover from job timeouts without a full gpu reset.
>>>>
>>>> Signed-off-by: Sathishkumar S <sathishkumar.sundararaju at amd.com>
>>>> ---
>>>> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 60
>>>> ++++++++++++++++++++++--
>>>> 1 file changed, 57 insertions(+), 3 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>>> b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>>> index be0b3b4c8690..62d8628dccc5 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>>> @@ -204,9 +204,7 @@ static int jpeg_v4_0_3_sw_init(struct
>>>> amdgpu_ip_block *ip_block)
>>>> if (r)
>>>> return r;
>>>> - /* TODO: Add queue reset mask when FW fully supports it */
>>>> - adev->jpeg.supported_reset =
>>>> - amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
>>>> + adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_PIPE;
>>>> r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
>>>> if (r)
>>>> return r;
>>>> @@ -231,6 +229,7 @@ static int jpeg_v4_0_3_sw_fini(struct
>>>> amdgpu_ip_block *ip_block)
>>>> return r;
>>>> amdgpu_jpeg_sysfs_reset_mask_fini(adev);
>>>> +
>>>> r = amdgpu_jpeg_sw_fini(adev);
>>>> return r;
>>>> @@ -1099,6 +1098,60 @@ static int
>>>> jpeg_v4_0_3_process_interrupt(struct amdgpu_device *adev,
>>>> return 0;
>>>> }
>>>> +static int jpeg_v4_0_3_wait_for_idle_on_inst(struct amdgpu_ring
>>>> *ring)
>>>> +{
>>>> + struct amdgpu_device *adev = ring->adev;
>>>> + struct amdgpu_ring *r;
>>>> + int ret, j;
>>>> +
>>>> + for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
>>>> + r = &adev->jpeg.inst[ring->me].ring_dec[j];
>>>> + r->sched.ready = false;
>>>> + }
>>>> + /* publish update */
>>>> + smp_rmb();
>>>
>>> Using smp_rmb() to publish the update is 100% incorrect.
>>>
>>> What exactly are you trying to do?
>> On JPEG4_0_3, multiple contexts may be submitting jobs to all cores
>> on the affected instance. I am setting sched.ready to false and using
>> smp_rmb() to publish that change to the other cores trying to submit
>> on this instance, so that they read the update immediately and stop
>> submitting before I wait for idle on the instance. This makes sure
>> the other good jpeg contexts get a chance to be migrated out to any
>> available instance before the reset starts; that way the good jpeg
>> contexts can continue without issues while this instance is being
>> reset and made ready to continue decoding after the reset.
>
> Ok, completely NAK to this approach!
>
> First of all, setting r->sched.ready to false is a complete no-go. I
> can't remember how often I had to remove that nonsense.
>
> With the exception of the KIQ the back-ends should *never* touch that
> stuff!
>
> Then the Linux kernel requires that every use of smp_rmb() documents
> the matching write memory barrier.
>
> Overall, please remove that code altogether. The wait-for-idle
> function is *only* supposed to wait for the HW to become idle and
> nothing else.
Okay Christian, thank you for explaining; I will remove this.
Regards,
Sathish
>
> Regards,
> Christian.
>
>>
>> Regards,
>> Sathish
>>>
>>> Regards,
>>> Christian.
>>>
>>>
>>>> + for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
>>>> + r = &adev->jpeg.inst[ring->me].ring_dec[j];
>>>> + if (r->pipe == j)
>>>> + continue;
>>>> + ret = SOC15_WAIT_ON_RREG_OFFSET(JPEG, GET_INST(JPEG,
>>>> ring->me),
>>>> + regUVD_JRBC0_UVD_JRBC_STATUS,
>>>> + jpeg_v4_0_3_core_reg_offset(j),
>>>> + UVD_JRBC0_UVD_JRBC_STATUS__RB_JOB_DONE_MASK,
>>>> + UVD_JRBC0_UVD_JRBC_STATUS__RB_JOB_DONE_MASK);
>>>> + if (ret)
>>>> + return ret;
>>>> + }
>>>> +
>>>> + return 0;
>>>> +}
>>>> +
>>>> +static int jpeg_v4_0_3_ring_reset(struct amdgpu_ring *ring,
>>>> unsigned int vmid)
>>>> +{
>>>> + struct amdgpu_device *adev = ring->adev;
>>>> + struct amdgpu_ring *r;
>>>> + int ret, j;
>>>> +
>>>> + if (amdgpu_sriov_vf(adev))
>>>> + return -EINVAL;
>>>> +
>>>> + jpeg_v4_0_3_wait_for_idle_on_inst(ring);
>>>> + jpeg_v4_0_3_stop_inst(ring->adev, ring->me);
>>>> + jpeg_v4_0_3_start_inst(ring->adev, ring->me);
>>>> + for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
>>>> + r = &adev->jpeg.inst[ring->me].ring_dec[j];
>>>> + jpeg_v4_0_3_start_jrbc(r);
>>>> + ret = amdgpu_ring_test_helper(r);
>>>> + if (ret)
>>>> + return ret;
>>>> + r->sched.ready = true;
>>>> + }
>>>> + /* publish update */
>>>> + smp_rmb();
>>>> + dev_info(adev->dev, "Reset on %s succeeded\n", ring->sched.name);
>>>> + return 0;
>>>> +}
>>>> +
>>>> static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = {
>>>> .name = "jpeg_v4_0_3",
>>>> .early_init = jpeg_v4_0_3_early_init,
>>>> @@ -1145,6 +1198,7 @@ static const struct amdgpu_ring_funcs
>>>> jpeg_v4_0_3_dec_ring_vm_funcs = {
>>>> .emit_wreg = jpeg_v4_0_3_dec_ring_emit_wreg,
>>>> .emit_reg_wait = jpeg_v4_0_3_dec_ring_emit_reg_wait,
>>>> .emit_reg_write_reg_wait =
>>>> amdgpu_ring_emit_reg_write_reg_wait_helper,
>>>> + .reset = jpeg_v4_0_3_ring_reset,
>>>> };
>>>> static void jpeg_v4_0_3_set_dec_ring_funcs(struct amdgpu_device
>>>> *adev)
>>>
>>
>
More information about the amd-gfx
mailing list