[PATCH] drm/amdgpu:guarantee 128dws between vm flush and IB(v3)

Wed Jan 18 11:42:19 UTC 2017

previously we always insert 128nops behind vm_flush, which
may lead to DAMframe size above 256 dw and automatially aligned
to 512 dw.

now we calculate how many DWs already inserted after vm_flush
and make up for the reset to pad up to 128dws before emit_ib.
that way we only take 256 dw per submit.

v2:
drop insert_nops in vm_flush
re-calculate the estimate frame size for gfx8 ring
v3:
calculate the gap between vm_flush and IB in cntx_cntl
on an member field of ring structure
v4:
set last_vm_flush_pos even for case of no vm flush.

Change-Id: I0a1845de07c0b86ac1b77ac1a007e893144144fd
Signed-off-by: Monk Liu <Monk.Liu at amd.com>
---
 drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h |  1 +
 drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c   |  7 +++++++
 drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c    | 16 ++++++++++++----
 3 files changed, 20 insertions(+), 4 deletions(-)

diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index c813cbe..332969f 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -173,6 +173,7 @@ struct amdgpu_ring {
 #if defined(CONFIG_DEBUG_FS)
 	struct dentry *ent;
 #endif
+	u32 last_vm_flush_pos;
 };
 
 int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned ndw);
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
index d05546e..53fc5e0 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vm.c
@@ -396,6 +396,13 @@ int amdgpu_vm_flush(struct amdgpu_ring *ring, struct amdgpu_job *job)
 	    amdgpu_vm_ring_has_compute_vm_bug(ring)))
 		amdgpu_ring_emit_pipeline_sync(ring);
 
+	/* when no vm-flush this frame, we still need to mark down
+	 * the position of the tail of hdp_flush, because we still
+	 * need to make sure there are 128 DWs between last SWITCH_BUFFER and
+	 * the emit_ib this frame. otherwise there is still VM fault occured on
+	 * constant engine.
+	 */
+	ring->last_vm_flush_pos = ring->wptr;
 	if (ring->funcs->emit_vm_flush && (job->vm_needs_flush ||
 	    amdgpu_vm_is_gpu_reset(adev, id))) {
 		struct fence *fence;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 5f37313..dde4177 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -6572,7 +6572,6 @@ static void gfx_v8_0_ring_emit_fence_gfx(struct amdgpu_ring *ring, u64 addr,
 			  DATA_SEL(write64bit ? 2 : 1) | INT_SEL(int_sel ? 2 : 0));
 	amdgpu_ring_write(ring, lower_32_bits(seq));
 	amdgpu_ring_write(ring, upper_32_bits(seq));
-
 }
 
 static void gfx_v8_0_ring_emit_pipeline_sync(struct amdgpu_ring *ring)
@@ -6636,9 +6635,8 @@ static void gfx_v8_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
 		/* sync PFP to ME, otherwise we might get invalid PFP reads */
 		amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
 		amdgpu_ring_write(ring, 0x0);
-		/* GFX8 emits 128 dw nop to prevent CE access VM before vm_flush finish */
-		amdgpu_ring_insert_nop(ring, 128);
 	}
+	ring->last_vm_flush_pos = ring->wptr;
 }
 
 static u32 gfx_v8_0_ring_get_wptr_compute(struct amdgpu_ring *ring)
@@ -6743,6 +6741,15 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
 	if (amdgpu_sriov_vf(ring->adev))
 		gfx_v8_0_ring_emit_de_meta_init(ring,
 			(flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+
+	/* We need to pad some NOPs before emit_ib to prevent CE run ahead of
+	 * vm_flush, which may trigger VM fault. */
+	if (ring->wptr > ring->last_vm_flush_pos) /* no wptr wrapping to RB head */
+		amdgpu_ring_insert_nop(ring, 128 - (ring->wptr - ring->last_vm_flush_pos));
+	else
+		if (ring->ptr_mask + 1 - ring->last_vm_flush_pos + ring->wptr < 128)
+			amdgpu_ring_insert_nop(ring,
+				128 - (ring->ptr_mask + 1 - ring->last_vm_flush_pos + ring->wptr));
 }
 
 static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
@@ -7018,7 +7025,8 @@ static const struct amdgpu_ring_funcs gfx_v8_0_ring_funcs_gfx = {
 		7 + /* gfx_v8_0_ring_emit_pipeline_sync */
 		128 + 19 + /* gfx_v8_0_ring_emit_vm_flush */
 		2 + /* gfx_v8_ring_emit_sb */
-		3 + 4 + 29, /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
+		3 + 4 + 29 - /* gfx_v8_ring_emit_cntxcntl including vgt flush/meta-data */
+		20 - 7 - 6 - 3 - 4 - 29, /* no need to count gds/hdp_flush/vm_flush fence/cntx_cntl/vgt_flush/meta-data anymore */
 	.emit_ib_size =	4, /* gfx_v8_0_ring_emit_ib_gfx */
 	.emit_ib = gfx_v8_0_ring_emit_ib_gfx,
 	.emit_fence = gfx_v8_0_ring_emit_fence_gfx,
-- 
2.7.4