[PATCH] drm/amdgpu: Always emit GDS switch when GDS/GWS/OA is used

Thu Jul 20 21:25:46 UTC 2023

Hi,

On 07.07.23 10:21, Christian König wrote:
> Am 07.07.23 um 09:28 schrieb Friedrich Vock:
>> Hi Christian,
>>
>> On 07.07.23 08:56, Christian König wrote:
>>>
>>>
>>> Am 07.07.23 um 08:28 schrieb Friedrich Vock:
>>>> During gfxoff, the per-VMID GDS registers are reset and not restored
>>>> afterwards.
>>>
>>> Hui? Since when? Those registers should be part of the saved ones.
>>>
>>> Have you found that by observation?
>>
>> yes. I tested this on my RX 6700 XT and the Steam Deck (Vangogh). In the
>> bug report I linked, a test program using GWS I developed hangs because
>> of this.
>>
>> The hang occurs as soon as the kernel re-uses a VMID on which GWS was
>> already used once. In the hung state, inspecting the per-VMID GWS
>> registers shows that the values have been reset to 0.
>> The hang does not occur when gfxoff is disabled.
>>
>> Even without causing hangs, you can confirm the behaviour by doing the
>> following:
>> 1. Disable gfxoff.
>> 2. Set some GWS registers.
>> 3. Enable gfxoff and wait a bit.
>> 4. Disable gfxoff and read the registers again. The GWS registers have
>> been reset.
>>
>> I performed this test for the GDS_BASE/SIZE registers and it seems these
>> aren't affected, so it's only GWS that is buggy here.
>
> That's most like a bug in the FW then. I'm going to ask around
> internally.

Did the talks with the FW team result in anything yet? It's not that
high-priority, but it'd be nice to know if this is going to be fixed in
the firmware or if I should make a v2 (or if this isn't going to be
fixed at all).

Thanks,
Friedrich

>
>> I should probably make a v2 that combines the behaviour before this
>> patch for GDS and OA, and the patched behaviour for GWS.
>
> Yeah, that sounds like a good idea to me. But let me ping the fw teams
> first.
>
>>
>> I'm not aware of userspace using GWS (yet, I had some ideas for using it
>> in RADV which is what I've been writing these tests for),
>> so perhaps the Cc to stable can also be omitted.
>
> Depends on what the fw teams says. As far as I know GWS has never been
> used widely on Linux.
>
> Could be that they say there is a hw bug and we deprecated it for this
> generation, or it's simply not handled by the fw and the driver needs
> to take care of this (like this patch does) or whatever.
>
> Thanks for the notice,
> Christian.
>
>>
>> Thanks,
>> Friedrich
>>
>>>
>>> Thanks,
>>> Christian.
>>>
>>>
>>>>   The kernel needs to emit a GDS switch to manually update the
>>>> GWS registers in this case. Since gfxoff can happen between any two
>>>> submissions and the kernel has no way of knowing, emit the GDS switch
>>>> before every submission.
>>>>
>>>> Fixes: 56b0989e29 ("drm/amdgpu: fix GDS/GWS/OA switch handling")
>>>> Cc: stable at vger.kernel.org
>>>> Link: https://gitlab.freedesktop.org/drm/amd/-/issues/2530
>>>> Signed-off-by: Friedrich Vock <friedrich.vock at gmx.de>
>>>> ---
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c | 22 +++++++---------------
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_job.h |  1 -
>>>>   drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c  | 10 ++++++++--
>>>>   3 files changed, 15 insertions(+), 18 deletions(-)
>>>>
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> index ff1ea99292fb..de73797e9279 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ids.c
>>>> @@ -165,24 +165,17 @@ bool amdgpu_vmid_had_gpu_reset(struct
>>>> amdgpu_device *adev,
>>>>           atomic_read(&adev->gpu_reset_counter);
>>>>   }
>>>>
>>>> -/* Check if we need to switch to another set of resources */
>>>> -static bool amdgpu_vmid_gds_switch_needed(struct amdgpu_vmid *id,
>>>> -                      struct amdgpu_job *job)
>>>> -{
>>>> -    return id->gds_base != job->gds_base ||
>>>> -        id->gds_size != job->gds_size ||
>>>> -        id->gws_base != job->gws_base ||
>>>> -        id->gws_size != job->gws_size ||
>>>> -        id->oa_base != job->oa_base ||
>>>> -        id->oa_size != job->oa_size;
>>>> -}
>>>> -
>>>>   /* Check if the id is compatible with the job */
>>>>   static bool amdgpu_vmid_compatible(struct amdgpu_vmid *id,
>>>>                      struct amdgpu_job *job)
>>>>   {
>>>>       return  id->pd_gpu_addr == job->vm_pd_addr &&
>>>> -        !amdgpu_vmid_gds_switch_needed(id, job);
>>>> +        id->gds_base == job->gds_base &&
>>>> +        id->gds_size == job->gds_size &&
>>>> +        id->gws_base == job->gws_base &&
>>>> +        id->gws_size == job->gws_size &&
>>>> +        id->oa_base == job->oa_base &&
>>>> +        id->oa_size == job->oa_size;
>>>>   }
>>>>
>>>>   /**
>>>> @@ -434,7 +427,6 @@ int amdgpu_vmid_grab(struct amdgpu_vm *vm, struct
>>>> amdgpu_ring *ring,
>>>>           list_move_tail(&id->list, &id_mgr->ids_lru);
>>>>       }
>>>>
>>>> -    job->gds_switch_needed = amdgpu_vmid_gds_switch_needed(id, job);
>>>>       if (job->vm_needs_flush) {
>>>>           id->flushed_updates = amdgpu_vm_tlb_seq(vm);
>>>>           dma_fence_put(id->last_flush);
>>>> @@ -503,7 +495,7 @@ void amdgpu_vmid_free_reserved(struct
>>>> amdgpu_device *adev,
>>>>    * @vmhub: vmhub type
>>>>    * @vmid: vmid number to use
>>>>    *
>>>> - * Reset saved GDW, GWS and OA to force switch on next flush.
>>>> + * Reset saved GDS, GWS and OA data.
>>>>    */
>>>>   void amdgpu_vmid_reset(struct amdgpu_device *adev, unsigned vmhub,
>>>>                  unsigned vmid)
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> index a963a25ddd62..2898508b1ce4 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
>>>> @@ -53,7 +53,6 @@ struct amdgpu_job {
>>>>       uint32_t        preamble_status;
>>>>       uint32_t                preemption_status;
>>>>       bool                    vm_needs_flush;
>>>> -    bool            gds_switch_needed;
>>>>       bool            spm_update_needed;
>>>>       uint64_t        vm_pd_addr;
>>>>       unsigned        vmid;
>>>> diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> index 291977b93b1d..61856040cae2 100644
>>>> --- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> +++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
>>>> @@ -557,6 +557,12 @@ void amdgpu_vm_check_compute_bug(struct
>>>> amdgpu_device *adev)
>>>>       }
>>>>   }
>>>>
>>>> +/* Check if the job needs a GDS switch */
>>>> +static bool amdgpu_vm_need_gds_switch(struct amdgpu_job *job)
>>>> +{
>>>> +    return job->gds_size || job->gws_size || job->oa_size;
>>>> +}
>>>> +
>>>>   /**
>>>>    * amdgpu_vm_need_pipeline_sync - Check if pipe sync is needed for
>>>> job.
>>>>    *
>>>> @@ -579,7 +585,7 @@ bool amdgpu_vm_need_pipeline_sync(struct
>>>> amdgpu_ring *ring,
>>>>       if (job->vm_needs_flush || ring->has_compute_vm_bug)
>>>>           return true;
>>>>
>>>> -    if (ring->funcs->emit_gds_switch && job->gds_switch_needed)
>>>> +    if (ring->funcs->emit_gds_switch &&
>>>> amdgpu_vm_need_gds_switch(job))
>>>>           return true;
>>>>
>>>>       if (amdgpu_vmid_had_gpu_reset(adev, &id_mgr->ids[job->vmid]))
>>>> @@ -609,7 +615,7 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring,
>>>> struct amdgpu_job *job,
>>>>       struct amdgpu_vmid *id = &id_mgr->ids[job->vmid];
>>>>       bool spm_update_needed = job->spm_update_needed;
>>>>       bool gds_switch_needed = ring->funcs->emit_gds_switch &&
>>>> -        job->gds_switch_needed;
>>>> +        amdgpu_vm_need_gds_switch(job);
>>>>       bool vm_flush_needed = job->vm_needs_flush;
>>>>       struct dma_fence *fence = NULL;
>>>>       bool pasid_mapping_needed = false;
>>>> --
>>>> 2.41.0
>>>>
>>>
>