[PATCH 2/7] drm/amdgpu: Add ring reset callback for JPEG4_0_3
Christian König
christian.koenig at amd.com
Fri Jan 31 16:49:03 UTC 2025
Am 31.01.25 um 17:40 schrieb Sundararaju, Sathishkumar:
> Hi Christian,
>
>
> On 1/31/2025 9:56 PM, Christian König wrote:
>> Am 31.01.25 um 17:23 schrieb Sathishkumar S:
>>> Add ring reset function callback for JPEG4_0_3 to
>>> recover from job timeouts without a full gpu reset.
>>>
>>> Signed-off-by: Sathishkumar S <sathishkumar.sundararaju at amd.com>
>>> ---
>>> drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c | 60
>>> ++++++++++++++++++++++--
>>> 1 file changed, 57 insertions(+), 3 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>> b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>> index be0b3b4c8690..62d8628dccc5 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/jpeg_v4_0_3.c
>>> @@ -204,9 +204,7 @@ static int jpeg_v4_0_3_sw_init(struct
>>> amdgpu_ip_block *ip_block)
>>> if (r)
>>> return r;
>>> - /* TODO: Add queue reset mask when FW fully supports it */
>>> - adev->jpeg.supported_reset =
>>> - amdgpu_get_soft_full_reset_mask(&adev->jpeg.inst[0].ring_dec[0]);
>>> + adev->jpeg.supported_reset = AMDGPU_RESET_TYPE_PER_PIPE;
>>> r = amdgpu_jpeg_sysfs_reset_mask_init(adev);
>>> if (r)
>>> return r;
>>> @@ -231,6 +229,7 @@ static int jpeg_v4_0_3_sw_fini(struct
>>> amdgpu_ip_block *ip_block)
>>> return r;
>>> amdgpu_jpeg_sysfs_reset_mask_fini(adev);
>>> +
>>> r = amdgpu_jpeg_sw_fini(adev);
>>> return r;
>>> @@ -1099,6 +1098,60 @@ static int
>>> jpeg_v4_0_3_process_interrupt(struct amdgpu_device *adev,
>>> return 0;
>>> }
>>> +static int jpeg_v4_0_3_wait_for_idle_on_inst(struct amdgpu_ring
>>> *ring)
>>> +{
>>> + struct amdgpu_device *adev = ring->adev;
>>> + struct amdgpu_ring *r;
>>> + int ret, j;
>>> +
>>> + for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
>>> + r = &adev->jpeg.inst[ring->me].ring_dec[j];
>>> + r->sched.ready = false;
>>> + }
>>> + /* publish update */
>>> + smp_rmb();
>>
>> Using smp_rmb() to publish the update is 100% incorrect.
>>
>> What exactly are you trying to do?
> On JPEG4_0_3, there are possibly multiple contexts submitting jobs to
> all cores on the affected instance,
> so I am changing sched_ready to false and using smp_rmb() to publish
> to other cores which are trying to
> submit on this instance , for them to read the updated change
> immediately and stop submission before
> I wait for idle on this instance, which makes sure other good contexts
> of jpeg get a chance to be migrated
> out to any available instance before reset starts, that way good jpeg
> contexts get a chance to continue
> without issues while this instance is being reset and made ready to
> continue decoding after reset.
Ok, completely NAK to this approach!
First of all setting r->sched.ready to false is a completely no-go. I
can't remember how often I had to remove that nonsense.
With the exception of the KIQ the back-ends should *never* touch that stuff!
Then the Linux kernel requires that ever use of smp_rmb() requires to
document the matching write memory barrier.
Over all please completely remove that code all together. The wait for
idle function is *only* supposed to wait for the HW to become idle and
nothing else.
Regards,
Christian.
>
> Regards,
> Sathish
>>
>> Regards,
>> Christian.
>>
>>
>>> + for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
>>> + r = &adev->jpeg.inst[ring->me].ring_dec[j];
>>> + if (r->pipe == j)
>>> + continue;
>>> + ret = SOC15_WAIT_ON_RREG_OFFSET(JPEG, GET_INST(JPEG,
>>> ring->me),
>>> + regUVD_JRBC0_UVD_JRBC_STATUS,
>>> + jpeg_v4_0_3_core_reg_offset(j),
>>> + UVD_JRBC0_UVD_JRBC_STATUS__RB_JOB_DONE_MASK,
>>> + UVD_JRBC0_UVD_JRBC_STATUS__RB_JOB_DONE_MASK);
>>> + if (ret)
>>> + return ret;
>>> + }
>>> +
>>> + return 0;
>>> +}
>>> +
>>> +static int jpeg_v4_0_3_ring_reset(struct amdgpu_ring *ring,
>>> unsigned int vmid)
>>> +{
>>> + struct amdgpu_device *adev = ring->adev;
>>> + struct amdgpu_ring *r;
>>> + int ret, j;
>>> +
>>> + if (amdgpu_sriov_vf(adev))
>>> + return -EINVAL;
>>> +
>>> + jpeg_v4_0_3_wait_for_idle_on_inst(ring);
>>> + jpeg_v4_0_3_stop_inst(ring->adev, ring->me);
>>> + jpeg_v4_0_3_start_inst(ring->adev, ring->me);
>>> + for (j = 0; j < adev->jpeg.num_jpeg_rings; ++j) {
>>> + r = &adev->jpeg.inst[ring->me].ring_dec[j];
>>> + jpeg_v4_0_3_start_jrbc(r);
>>> + ret = amdgpu_ring_test_helper(r);
>>> + if (ret)
>>> + return ret;
>>> + r->sched.ready = true;
>>> + }
>>> + /* publish update */
>>> + smp_rmb();
>>> + dev_info(adev->dev, "Reset on %s succeeded\n", ring->sched.name);
>>> + return 0;
>>> +}
>>> +
>>> static const struct amd_ip_funcs jpeg_v4_0_3_ip_funcs = {
>>> .name = "jpeg_v4_0_3",
>>> .early_init = jpeg_v4_0_3_early_init,
>>> @@ -1145,6 +1198,7 @@ static const struct amdgpu_ring_funcs
>>> jpeg_v4_0_3_dec_ring_vm_funcs = {
>>> .emit_wreg = jpeg_v4_0_3_dec_ring_emit_wreg,
>>> .emit_reg_wait = jpeg_v4_0_3_dec_ring_emit_reg_wait,
>>> .emit_reg_write_reg_wait =
>>> amdgpu_ring_emit_reg_write_reg_wait_helper,
>>> + .reset = jpeg_v4_0_3_ring_reset,
>>> };
>>> static void jpeg_v4_0_3_set_dec_ring_funcs(struct amdgpu_device
>>> *adev)
>>
>
More information about the amd-gfx
mailing list