[PATCH] drm/amdgpu: GFX9, GFX10: GRBM requires 1-cycle delay

Koenig, Christian Christian.Koenig at amd.com
Fri Oct 25 06:49:50 UTC 2019


Am 24.10.19 um 23:16 schrieb Tuikov, Luben:
> The GRBM interface is now capable of bursting
> 1-cycle op per register, a WRITE followed by
> another WRITE, or a WRITE followed by a READ--much
> faster than previous multi-cycle per
> completed-transaction interface. This causes a
> problem, whereby status registers requiring a
> read/write by hardware, have a 1-cycle delay, due
> to the register update having to go through GRBM
> interface.
>
> This patch adds this delay.
>
> A one cycle read op is added after updating the
> invalidate request and before reading the
> invalidate-ACK status.

Please completely drop all changes for GFX9 since this patch will most 
likely break SRIOV.

In addition to that, please apply the workaround only to SDMA, since the 
CP-driven engines should handle that in firmware.

Regards,
Christian.

>
> See also commit
> 534991731cb5fa94b5519957646cf849ca10d17d.
>
> Signed-off-by: Luben Tuikov <luben.tuikov at amd.com>
> ---
>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c  | 4 ++--
>   drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c | 9 +++++++++
>   drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c  | 8 ++++++++
>   drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 2 +-
>   5 files changed, 22 insertions(+), 5 deletions(-)
>
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> index ac43b1af69e3..0042868dbd53 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> @@ -5129,7 +5129,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
>   		5 + /* COND_EXEC */
>   		7 + /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 + /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -5182,7 +5182,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v10_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v10_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v10_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v10_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> index 9fe95e7693d5..9a7a717208de 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v9_0.c
> @@ -6218,7 +6218,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_gfx = {
>   		5 +  /* COND_EXEC */
>   		7 +  /* PIPELINE_SYNC */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* VM_FLUSH */
>   		8 +  /* FENCE for VM_FLUSH */
>   		20 + /* GDS switch */
> @@ -6271,7 +6271,7 @@ static const struct amdgpu_ring_funcs gfx_v9_0_ring_funcs_compute = {
>   		5 + /* hdp invalidate */
>   		7 + /* gfx_v9_0_ring_emit_pipeline_sync */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 * 2 +
>   		2 + /* gfx_v9_0_ring_emit_vm_flush */
>   		8 + 8 + 8, /* gfx_v9_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size =	7, /* gfx_v9_0_ring_emit_ib_compute */
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> index 6e1b25bd1fe7..100d526e9a42 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v10_0.c
> @@ -346,6 +346,15 @@ static uint64_t gmc_v10_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   
>   	amdgpu_ring_emit_wreg(ring, hub->vm_inv_eng0_req + eng, req);
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_SDMA ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	/* wait for the invalidate to complete */
>   	amdgpu_ring_emit_reg_wait(ring, hub->vm_inv_eng0_ack + eng,
>   				  1 << vmid, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> index 9f2a893871ec..8f3097e45299 100644
> --- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
> @@ -495,6 +495,14 @@ static uint64_t gmc_v9_0_emit_flush_gpu_tlb(struct amdgpu_ring *ring,
>   	amdgpu_ring_emit_wreg(ring, hub->ctx0_ptb_addr_hi32 + (2 * vmid),
>   			      upper_32_bits(pd_addr));
>   
> +	/* Insert a dummy read to delay one cycle before the ACK
> +	 * inquiry.
> +	 */
> +	if (ring->funcs->type == AMDGPU_RING_TYPE_GFX  ||
> +	    ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE)
> +		amdgpu_ring_emit_reg_wait(ring,
> +					  hub->vm_inv_eng0_req + eng, 0, 0);
> +
>   	amdgpu_ring_emit_reg_write_reg_wait(ring, hub->vm_inv_eng0_req + eng,
>   					    hub->vm_inv_eng0_ack + eng,
>   					    req, 1 << vmid);
> diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> index b8fdb192f6d6..0c41b4fdc58b 100644
> --- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> +++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
> @@ -1588,7 +1588,7 @@ static const struct amdgpu_ring_funcs sdma_v5_0_ring_funcs = {
>   		6 + /* sdma_v5_0_ring_emit_pipeline_sync */
>   		/* sdma_v5_0_ring_emit_vm_flush */
>   		SOC15_FLUSH_GPU_TLB_NUM_WREG * 3 +
> -		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 +
> +		SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 6 * 2 +
>   		10 + 10 + 10, /* sdma_v5_0_ring_emit_fence x3 for user fence, vm fence */
>   	.emit_ib_size = 7 + 6, /* sdma_v5_0_ring_emit_ib */
>   	.emit_ib = sdma_v5_0_ring_emit_ib,



More information about the amd-gfx mailing list