[PATCH 2/2] drm/amdgpu: workaround to avoid SET_Q_MODE packets
Christian König
ckoenig.leichtzumerken at gmail.com
Thu Feb 29 14:49:25 UTC 2024
It turned out that executing the SET_Q_MODE packet on every submission
creates to much overhead.
Implement a workaround which allows skipping the SET_Q_MODE packet if
subsequent submissions all use the same parameters.
Signed-off-by: Christian König <christian.koenig at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 3 +
drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c | 104 +++++++++++++++++++----
2 files changed, 92 insertions(+), 15 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 756330767909..582053f1cd56 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -285,6 +285,9 @@ struct amdgpu_ring {
unsigned cond_exe_offs;
u64 cond_exe_gpu_addr;
volatile u32 *cond_exe_cpu_addr;
+ unsigned int set_q_mode_offs;
+ volatile u32 *set_q_mode_ptr;
+ u64 set_q_mode_token;
unsigned vm_hub;
unsigned vm_inv_eng;
struct dma_fence *vmid_wait;
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
index 2ccbdee570cf..6e6b6eff48e2 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v11_0.c
@@ -5461,6 +5461,11 @@ static void gfx_v11_0_ring_emit_vm_flush(struct amdgpu_ring *ring,
amdgpu_ring_write(ring, PACKET3(PACKET3_PFP_SYNC_ME, 0));
amdgpu_ring_write(ring, 0x0);
}
+
+ /* Make sure that we can't skip the SET_Q_MODE packets when the VM
+ * changed in any way.
+ */
+ ring->set_q_mode_ptr = NULL;
}
static void gfx_v11_0_ring_emit_fence_kiq(struct amdgpu_ring *ring, u64 addr,
@@ -5510,16 +5515,81 @@ static void gfx_v11_0_ring_emit_cntxcntl(struct amdgpu_ring *ring,
amdgpu_ring_write(ring, 0);
}
+static unsigned gfx_v11_0_ring_emit_init_cond_exec(struct amdgpu_ring *ring,
+ uint64_t addr)
+{
+ unsigned ret;
+
+ amdgpu_ring_write(ring, PACKET3(PACKET3_COND_EXEC, 3));
+ amdgpu_ring_write(ring, lower_32_bits(addr));
+ amdgpu_ring_write(ring, upper_32_bits(addr));
+ /* discard following DWs if *cond_exec_gpu_addr==0 */
+ amdgpu_ring_write(ring, 0);
+ ret = ring->wptr & ring->buf_mask;
+ /* patch dummy value later */
+ amdgpu_ring_write(ring, 0);
+
+ return ret;
+}
+
static void gfx_v11_0_ring_emit_gfx_shadow(struct amdgpu_ring *ring,
u64 shadow_va, u64 csa_va,
u64 gds_va, bool init_shadow,
int vmid)
{
struct amdgpu_device *adev = ring->adev;
+ unsigned int offs, end;
if (!adev->gfx.cp_gfx_shadow)
return;
+ /*
+ * The logic here isn't easy to understand because we need to keep state
+ * accross multiple executions of the function as well as between the
+ * CPU and GPU. The general idea is that the newly written GPU command
+ * has a condition on the previous one and only executed if really
+ * necessary.
+ */
+
+ /*
+ * The dw in the NOP controls if the next SET_Q_MODE packet should be
+ * executed or not. Reserve 64bits just to be on the save side.
+ */
+ amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, 1));
+ offs = ring->wptr & ring->buf_mask;
+
+ /*
+ * We start with skipping the prefix SET_Q_MODE and always executing
+ * the postfix SET_Q_MODE packet. This is changed below with a
+ * WRITE_DATA command when the postfix executed.
+ */
+ amdgpu_ring_write(ring, shadow_va ? 1 : 0);
+ amdgpu_ring_write(ring, 0);
+
+ if (ring->set_q_mode_offs) {
+ uint64_t addr;
+
+ addr = amdgpu_bo_gpu_offset(ring->ring_obj);
+ addr += ring->set_q_mode_offs << 2;
+ end = gfx_v11_0_ring_emit_init_cond_exec(ring, addr);
+ }
+
+ /*
+ * When the postfix SET_Q_MODE packet executes we need to make sure that the
+ * next prefix SET_Q_MODE packet executes as well.
+ */
+ if (!shadow_va) {
+ uint64_t addr;
+
+ addr = amdgpu_bo_gpu_offset(ring->ring_obj);
+ addr += offs << 2;
+ amdgpu_ring_write(ring, PACKET3(PACKET3_WRITE_DATA, 3));
+ amdgpu_ring_write(ring, WRITE_DATA_DST_SEL(5) | WR_CONFIRM);
+ amdgpu_ring_write(ring, lower_32_bits(addr));
+ amdgpu_ring_write(ring, upper_32_bits(addr));
+ amdgpu_ring_write(ring, 0x1);
+ }
+
amdgpu_ring_write(ring, PACKET3(PACKET3_SET_Q_PREEMPTION_MODE, 7));
amdgpu_ring_write(ring, lower_32_bits(shadow_va));
amdgpu_ring_write(ring, upper_32_bits(shadow_va));
@@ -5531,23 +5601,26 @@ static void gfx_v11_0_ring_emit_gfx_shadow(struct amdgpu_ring *ring,
PACKET3_SET_Q_PREEMPTION_MODE_IB_VMID(vmid) : 0);
amdgpu_ring_write(ring, init_shadow ?
PACKET3_SET_Q_PREEMPTION_MODE_INIT_SHADOW_MEM : 0);
-}
-static unsigned gfx_v11_0_ring_emit_init_cond_exec(struct amdgpu_ring *ring,
- uint64_t addr)
-{
- unsigned ret;
+ if (ring->set_q_mode_offs)
+ amdgpu_ring_patch_cond_exec(ring, end);
- amdgpu_ring_write(ring, PACKET3(PACKET3_COND_EXEC, 3));
- amdgpu_ring_write(ring, lower_32_bits(addr));
- amdgpu_ring_write(ring, upper_32_bits(addr));
- /* discard following DWs if *cond_exec_gpu_addr==0 */
- amdgpu_ring_write(ring, 0);
- ret = ring->wptr & ring->buf_mask;
- /* patch dummy value later */
- amdgpu_ring_write(ring, 0);
+ if (shadow_va) {
+ uint64_t token = shadow_va ^ csa_va ^ gds_va ^ vmid;
- return ret;
+ /*
+ * If the tokens match try to skip the last postfix SET_Q_MODE
+ * packet to avoid saving/restoring the state all the time.
+ */
+ if (ring->set_q_mode_ptr && ring->set_q_mode_token == token)
+ *ring->set_q_mode_ptr = 0;
+
+ ring->set_q_mode_token = token;
+ } else {
+ ring->set_q_mode_ptr = &ring->ring[ring->set_q_mode_offs];
+ }
+
+ ring->set_q_mode_offs = offs;
}
static int gfx_v11_0_ring_preempt_ib(struct amdgpu_ring *ring)
@@ -6114,7 +6187,7 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = {
.emit_frame_size = /* totally 247 maximum if 16 IBs */
5 + /* update_spm_vmid */
5 + /* COND_EXEC */
- 9 + /* SET_Q_PREEMPTION_MODE */
+ 22 + /* SET_Q_PREEMPTION_MODE */
7 + /* PIPELINE_SYNC */
SOC15_FLUSH_GPU_TLB_NUM_WREG * 5 +
SOC15_FLUSH_GPU_TLB_NUM_REG_WAIT * 7 +
@@ -6127,6 +6200,7 @@ static const struct amdgpu_ring_funcs gfx_v11_0_ring_funcs_gfx = {
31 + /* DE_META */
3 + /* CNTX_CTRL */
5 + /* HDP_INVL */
+ 22 + /* SET_Q_PREEMPTION_MODE */
8 + 8 + /* FENCE x2 */
8, /* gfx_v11_0_emit_mem_sync */
.emit_ib_size = 4, /* gfx_v11_0_ring_emit_ib_gfx */
--
2.34.1
More information about the amd-gfx
mailing list