[PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Koenig, Christian Christian.Koenig at amd.com
Fri Oct 25 16:19:31 UTC 2019


Am 25.10.19 um 18:05 schrieb Alex Deucher:
> On Fri, Oct 25, 2019 at 2:49 AM Koenig, Christian
> <Christian.Koenig at amd.com> wrote:
>> Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
>>> The GRBM interface is now capable of bursting
>>> 1-cycle op per register, a WRITE followed by
>>> another WRITE, or a WRITE followed by a READ--much
>>> faster than previous multi-cycle per
>>> completed-transaction interface. This causes a
>>> problem, whereby status registers requiring a
>>> read/write by hardware, have a 1-cycle delay, due
>>> to the register update having to go through GRBM
>>> interface.
>>>
>>> This patch adds this delay.
>>>
>>> A one cycle read op is added after updating the
>>> invalidate request and before reading the
>>> invalidate-ACK status.
>> Please completely drop all changes for GFX9 since this patch will most
>> likely break SRIOV.
>>
>> In addition to that, please apply the workaround only to SDMA since the CP
>> driven engines should handle that in firmware.
> I think the CP only handles this in firmware if we use the new TLB
> invalidation packet.  I don't think it applies it to general register
> writes like we do.

No, on the CP we should use the combined write/wait command even if we 
don't use the new specialized VM invalidate command. Everything else 
won't work with SRIOV.

Even if we wanted to, we can't insert an extra read in this combined
write/wait command. And if we split up the commands we would break SRIOV
once more.

So applying this workaround to the CP code doesn't make any sense at all.

The only TODO I can see is that we may not yet be using the combined
write/wait command on Navi.

Christian.

>
> Alex
>
>> Regards,
>> Christian.
>>
>>> See also commit
>>> 534991731cb5fa94b5519957646cf849ca10d17d.
>>>
>>> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
>>> ---
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>>>    drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>>>    drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>>>    5 files changed, 22 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> index ac43b1af69e3..0042868dbd53 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
>>> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>>>                5 + /* COND_EXEC */
>>>                7 + /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 + /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v10_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v10_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> index 9fe95e7693d5..9a7a717208de 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
>>> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>>>                5 +  /* COND_EXEC */
>>>                7 +  /* PIPELINE_SYNC */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* VM_FLUSH */
>>>                8 +  /* FENCE for VM_FLUSH */
>>>                20 + /* GDS switch */
>>> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>>>                5 + /* hdp invalidate */
>>>                7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>>>                2 + /* gfx_v9_0_ring_emit_vm_flush */
>>>                8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7, /* gfx_v9_0_ring_emit_ib_compute */
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> index 6e1b25bd1fe7..100d526e9a42 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
>>> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>
>>>        amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        /* wait for the invalidate to complete */
>>>        amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>>>                                  1 << vmid, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> index 9f2a893871ec..8f3097e45299 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
>>> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>>>        amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>>>                              upper_32_bits(pd_addr));
>>>
>>> +     /* Insert a dummy read to delay one cycle before the ACK
>>> +      * inquiry.
>>> +      */
>>> +     if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
>>> +         ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
>>> +             amdgpu_ring_emit_reg_wait(ring,
>>> +                                       hub->vm_inv_eng0_req + eng, 0, 0);
>>> +
>>>        amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>>>                                            hub->vm_inv_eng0_ack + eng,
>>>                                            req, 1 << vmid);
>>> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> index b8fdb192f6d6..0c41b4fdc58b 100644
>>> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
>>> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>>>                6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>>>                /* sdma_v5_0_ring_emit_vm_flush */
>>>                SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
>>> -             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
>>> +             SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>>>                10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>>>        .emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>>>        .emit_ib = sdma_v5_0_ring_emit_ib,
>> _______________________________________________
>> amd-gfx mailing list
>> amd-gfx at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/amd-gfx



More information about the amd-gfx mailing list