[RFC] drm/amdgpu/sdma5.2: Avoid latencies caused by the powergating workaround
Tvrtko Ursulin
tvrtko.ursulin at igalia.com
Fri Jul 11 12:23:38 UTC 2025
Commit
94b1e028e15c ("drm/amdgpu/sdma5.2: add begin/end_use ring callbacks")
added a workaround which disables GFXOFF for the duration of the job
submit stage (with a 100ms trailing hysteresis).
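For reference, that workaround is roughly of the following shape
(paraphrased from the referenced commit; the 100ms figure comes from the
delayed re-enable work scheduled by amdgpu_gfx_off_ctrl()):

  /* Paraphrased sketch of the begin/end_use callbacks from 94b1e028e15c. */
  static void sdma_v5_2_ring_begin_use(struct amdgpu_ring *ring)
  {
  	/* Bump the GFXOFF disable request count before touching the ring. */
  	amdgpu_gfx_off_ctrl(ring->adev, false);
  }

  static void sdma_v5_2_ring_end_use(struct amdgpu_ring *ring)
  {
  	/*
  	 * Drop the request; the actual re-enable happens from delayed work
  	 * inside amdgpu_gfx_off_ctrl(), hence the trailing hysteresis.
  	 */
  	amdgpu_gfx_off_ctrl(ring->adev, true);
  }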
Empirically the GFXOFF disable/enable request can suffer from significant
latencies (2ms is easily seen), which are then inserted into the
amdgpu_job_run() path and slow down the CPU submission of ready jobs.
1)
If the premise of the GFXOFF workaround is to keep it disabled while the
SDMA engine is active, the current workaround achieves that only
partially: it covers only submissions and jobs which take less than 100ms
(the GFXOFF re-enable hysteresis), counted from the ring write phase up to
completion.
2)
If disabling GFXOFF affects the GFX engine too, basing the workaround
solely on the SDMA activity creates, at minimum, a needless "chatter" on
the SMU communication channel.
If 1) and 2) hold true, we can improve on the workaround by: a) only
re-enabling GFXOFF once the job has actually completed*, and b) applying
the same workaround to other rings which share the same GFXOFF powergating
domain.
With these two applied, the GFXOFF re-enable requests are avoided
altogether during persistent activity on the GFX ring and simultaneous
sporadic activity on the SDMA ring.
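In condensed form the new scheme looks roughly as below (the actual
changes are in the diff that follows, where the ring type check stands in
for "shares the GFXOFF domain"):

  /* amdgpu_ib_schedule(): take a GFXOFF hold per job at submission. */
  if (job && adev->gfx.gfx_off_held &&
      (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
       ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE ||
       ring->funcs->type == AMDGPU_RING_TYPE_SDMA)) {
  	amdgpu_gfx_off_ctrl(adev, false);
  	job->gfx_off_held = true;
  }

  /* amdgpu_job_free_cb()/amdgpu_job_free(): drop the hold once the job is done. */
  if (job->gfx_off_held)
  	amdgpu_gfx_off_ctrl(job->adev, true);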
This has the positive effect of drastically reducing SDMA submission
latencies. For example, during the Cyberpunk 2077 benchmark they are
reduced from an average of 64us (stdev 60) to 9us (stdev 6). More
importantly, the worst case latency, averaged over a one second window, is
reduced from 305us to 30us**.
*) For ease of implementation we put the re-enable at the job free stage,
since doing it on actual completion is problematic in terms of locking.
**) Submission latency is EWMA averaged (DECLARE_EWMA(latency, 6, 4)).
With approximately 30 SDMA submissions per second and the EWMA average
logged once per second, the averaging significantly hides the worst case
latency, i.e. the real improvement in max submission latency is severely
understated by these numbers.
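For completeness, such a measurement can be done roughly as sketched
below. This instrumentation is not part of the patch; the hook point and
the once-per-second logging are assumptions, only DECLARE_EWMA(latency,
6, 4) is taken from the note above:

  #include <linux/average.h>
  #include <linux/ktime.h>

  DECLARE_EWMA(latency, 6, 4);

  static struct ewma_latency submit_latency; /* ewma_latency_init() at setup */
  static unsigned long submit_latency_max;

  /* Called with a ktime_get() timestamp taken just before submission. */
  static void sample_submit_latency(ktime_t start)
  {
  	unsigned long us = ktime_to_us(ktime_sub(ktime_get(), start));

  	ewma_latency_add(&submit_latency, us);
  	if (us > submit_latency_max)
  		submit_latency_max = us;
  }

  /* Logged roughly once per second: ewma_latency_read(&submit_latency). */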
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
References: 94b1e028e15c ("drm/amdgpu/sdma5.2: add begin/end_use ring callbacks")
Cc: Mario Limonciello <mario.limonciello at amd.com>
Cc: Christian König <christian.koenig at amd.com>
Cc: Alex Deucher <alexander.deucher at amd.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h | 1 +
drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c | 8 ++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_job.c | 7 +++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_job.h | 2 ++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 3 +++
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 1 +
drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 1 +
7 files changed, 23 insertions(+)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
index 08f268dab8f5..eee40f385793 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_gfx.h
@@ -475,6 +475,7 @@ struct amdgpu_gfx {
uint32_t compute_supported_reset;
/* gfx off */
+ bool gfx_off_held; /* true: rings hold gfx_off */
bool gfx_off_state; /* true: enabled, false: disabled */
struct mutex gfx_off_mutex; /* mutex to change gfxoff state */
uint32_t gfx_off_req_count; /* default 1, enable gfx off: dec 1, disable gfx off: add 1 */
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
index 206b70acb29a..bf9bffe40235 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ib.c
@@ -191,6 +191,14 @@ int amdgpu_ib_schedule(struct amdgpu_ring *ring, unsigned int num_ibs,
return r;
}
+ if (job && adev->gfx.gfx_off_held &&
+ (ring->funcs->type == AMDGPU_RING_TYPE_GFX ||
+ ring->funcs->type == AMDGPU_RING_TYPE_COMPUTE ||
+ ring->funcs->type == AMDGPU_RING_TYPE_SDMA)) {
+ amdgpu_gfx_off_ctrl(adev, false);
+ job->gfx_off_held = true;
+ }
+
need_ctx_switch = ring->current_ctx != fence_ctx;
if (ring->funcs->emit_pipeline_sync && job &&
((tmp = amdgpu_sync_get_fence(&job->explicit_sync)) ||
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
index 2b58e353cca1..4cfd175ac6df 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.c
@@ -191,6 +191,7 @@ int amdgpu_job_alloc(struct amdgpu_device *adev, struct amdgpu_vm *vm,
if (!*job)
return -ENOMEM;
+ (*job)->adev = adev;
(*job)->vm = vm;
amdgpu_sync_create(&(*job)->explicit_sync);
@@ -268,6 +269,9 @@ static void amdgpu_job_free_cb(struct drm_sched_job *s_job)
amdgpu_sync_free(&job->explicit_sync);
+ if (job->gfx_off_held)
+ amdgpu_gfx_off_ctrl(job->adev, true);
+
/* only put the hw fence if has embedded fence */
if (!job->hw_fence.base.ops)
kfree(job);
@@ -301,6 +305,9 @@ void amdgpu_job_free(struct amdgpu_job *job)
if (job->gang_submit != &job->base.s_fence->scheduled)
dma_fence_put(job->gang_submit);
+ if (job->gfx_off_held)
+ amdgpu_gfx_off_ctrl(job->adev, true);
+
if (!job->hw_fence.base.ops)
kfree(job);
else
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
index 2f302266662b..d4ab832ac193 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_job.h
@@ -46,6 +46,7 @@ enum amdgpu_ib_pool_type;
struct amdgpu_job {
struct drm_sched_job base;
+ struct amdgpu_device *adev;
struct amdgpu_vm *vm;
struct amdgpu_sync explicit_sync;
struct amdgpu_fence hw_fence;
@@ -55,6 +56,7 @@ struct amdgpu_job {
bool vm_needs_flush;
bool gds_switch_needed;
bool spm_update_needed;
+ bool gfx_off_held;
uint64_t vm_pd_addr;
unsigned vmid;
unsigned pasid;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 426834806fbf..22cac94e2f2a 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -350,6 +350,9 @@ int amdgpu_ring_init(struct amdgpu_device *adev, struct amdgpu_ring *ring,
ring->max_dw = max_dw;
ring->hw_prio = hw_prio;
+ if (ring->funcs->gfx_off_held)
+ adev->gfx.gfx_off_held = true;
+
if (!ring->no_scheduler && ring->funcs->type < AMDGPU_HW_IP_NUM) {
hw_ip = ring->funcs->type;
num_sched = &adev->gpu_sched[hw_ip][hw_prio].num_scheds;
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 784ba2ec354c..afaf951b0b78 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -202,6 +202,7 @@ struct amdgpu_ring_funcs {
bool support_64bit_ptrs;
bool no_user_fence;
bool secure_submission_supported;
+ bool gfx_off_held;
unsigned extra_dw;
/* ring read/write ptr handling */
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index 42a25150f83a..c88de65e82bc 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -1944,6 +1944,7 @@ static const struct amdgpu_ring_funcs sdma_v5_2_ring_funcs = {
.nop = SDMA_PKT_NOP_HEADER_OP(SDMA_OP_NOP),
.support_64bit_ptrs = true,
.secure_submission_supported = true,
+ .gfx_off_held = true,
.get_rptr = sdma_v5_2_ring_get_rptr,
.get_wptr = sdma_v5_2_ring_get_wptr,
.set_wptr = sdma_v5_2_ring_set_wptr,
--
2.48.0