[PATCH v4 2/3] drm/amdgpu: More efficient ring padding
Tvrtko Ursulin
tursulin at igalia.com
Mon Jul 15 10:40:26 UTC 2024
From: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
Having noticed that typically 200+ nops per submission are written into
the ring, using a rather verbose one-nop-at-a-time-plus-ring-buffer-
arithmetic as done in amdgpu_ring_write(), the obvious idea was to
improve it by filling those nops in blocks.
This patch therefore adds the amdgpu_ring_fill() helper which can do the
business in at most two memset32 calls.
The performance gains are not that amazing for normal workloads though.
For instance a game which results in two submissions per frame, each pads
with 222 nops, submission worker thread profile changes from:
+ 90.78% 0.67% kworker/u32:3-e [kernel.kallsyms] [k] process_one_work
+ 48.92% 0.12% kworker/u32:3-e [kernel.kallsyms] [k] commit_tail
+ 41.18% 1.73% kworker/u32:3-e [kernel.kallsyms] [k] amdgpu_dm_atomic_commit_tail
- 30.31% 0.67% kworker/u32:3-e [kernel.kallsyms] [k] drm_sched_run_job_work
- 29.63% drm_sched_run_job_work
+ 8.55% dma_fence_add_callback
- 7.50% amdgpu_job_run
- 7.43% amdgpu_ib_schedule
- 2.46% amdgpu_ring_commit
1.44% amdgpu_ring_insert_nop
To:
+ 88.28% 0.21% kworker/u32:4-e [kernel.kallsyms] [k] process_one_work
+ 45.22% 0.12% kworker/u32:4-e [kernel.kallsyms] [k] commit_tail
+ 37.93% 1.86% kworker/u32:4-e [kernel.kallsyms] [k] amdgpu_dm_atomic_commit_tail
- 29.28% 1.11% kworker/u32:4-e [kernel.kallsyms] [k] drm_sched_run_job_work
- 28.16% drm_sched_run_job_work
+ 9.93% dma_fence_add_callback
- 6.84% amdgpu_job_run
- 6.27% amdgpu_ib_schedule
1.23% gfx_v10_0_ring_emit_fence
1.20% amdgpu_ring_commit
But if we run a more "spammy" workload, which does several orders of
magnitude more submissions second we go from:
+ 79.38% 1.66% kworker/u32:1+e [kernel.kallsyms] [k] process_one_work
- 63.13% 6.66% kworker/u32:1+e [kernel.kallsyms] [k] drm_sched_run_job_work
- 56.47% drm_sched_run_job_work
- 25.67% amdgpu_job_run
- 24.40% amdgpu_ib_schedule
- 15.29% amdgpu_ring_commit
12.06% amdgpu_ring_insert_nop
To:
+ 77.83% 2.07% kworker/u32:2-e [kernel.kallsyms] [k] process_one_work
- 61.03% 7.75% kworker/u32:2-e [kernel.kallsyms] [k] drm_sched_run_job_work
- 53.27% drm_sched_run_job_work
- 17.31% amdgpu_job_run
- 16.34% amdgpu_ib_schedule
- 7.17% amdgpu_ring_commit
1.67% amdgpu_ring_insert_nop
Which is a noticeable improvement in time spent by amdgpu_ring_commit
inserting nops.
v2:
* Use a much more sensible "fill" approach.
v3:
* Forgot to git add.
v4:
* Fix amdgpu_ring_generic_pad_ib.
Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin at igalia.com>
Cc: Christian König <ckoenig.leichtzumerken at gmail.com>
---
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c | 19 +++++++++++++----
drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h | 27 ++++++++++++++++++++++++
drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c | 13 ++++++------
drivers/gpu/drm/amd/amdgpu/cik_sdma.c | 15 +++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c | 16 ++++++++------
drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c | 16 ++++++++------
12 files changed, 128 insertions(+), 74 deletions(-)
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
index 1a1963318424..28016e4571a1 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.c
@@ -108,9 +108,12 @@ int amdgpu_ring_alloc(struct amdgpu_ring *ring, unsigned int ndw)
*/
void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
- int i;
+ if (count == 0)
+ return;
- for (i = 0; i < count; i++)
+ if (count > 1)
+ amdgpu_ring_fill(ring, ring->funcs->nop, count);
+ else
amdgpu_ring_write(ring, ring->funcs->nop);
}
@@ -124,8 +127,16 @@ void amdgpu_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
*/
void amdgpu_ring_generic_pad_ib(struct amdgpu_ring *ring, struct amdgpu_ib *ib)
{
- while (ib->length_dw & ring->funcs->align_mask)
- ib->ptr[ib->length_dw++] = ring->funcs->nop;
+ u32 align_mask = ring->funcs->align_mask;
+ u32 count = ib->length_dw & align_mask;
+
+ if (count) {
+ count = align_mask + 1 - count;
+
+ memset32(&ib->ptr[ib->length_dw], ring->funcs->nop, count);
+
+ ib->length_dw += count;
+ }
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
index 582053f1cd56..2264d7502eb6 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_ring.h
@@ -412,6 +412,33 @@ static inline void amdgpu_ring_write_multiple(struct amdgpu_ring *ring,
ring->count_dw -= count_dw;
}
+static inline void amdgpu_ring_fill(struct amdgpu_ring *ring, u32 v,
+ unsigned int count_dw)
+{
+ unsigned int occupied, chunk1, chunk2;
+
+ if (!count_dw)
+ return;
+
+ if (unlikely(ring->count_dw < count_dw))
+ DRM_ERROR("amdgpu: writing more dwords to the ring than expected!\n");
+
+ occupied = ring->wptr & ring->buf_mask;
+ chunk1 = ring->buf_mask + 1 - occupied;
+ chunk1 = (chunk1 >= count_dw) ? count_dw : chunk1;
+ chunk2 = count_dw - chunk1;
+
+ if (chunk1)
+ memset32((u32 *)&ring->ring[occupied], v, chunk1);
+
+ if (chunk2)
+ memset32((u32 *)&ring->ring[0], v, chunk2);
+
+ ring->wptr += count_dw;
+ ring->wptr &= ring->ptr_mask;
+ ring->count_dw -= count_dw;
+}
+
/**
* amdgpu_ring_patch_cond_exec - patch dw count of conditional execute
* @ring: amdgpu_ring structure
diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
index 5acd20ff5979..61a9766b02e3 100644
--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
+++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_vpe.c
@@ -452,14 +452,13 @@ static int vpe_resume(void *handle)
static void vpe_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (i == 0)
- amdgpu_ring_write(ring, ring->funcs->nop |
- VPE_CMD_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ amdgpu_ring_write(ring, nop | VPE_CMD_NOP_HEADER_COUNT(--count));
+ amdgpu_ring_fill(ring, nop, count);
}
static uint64_t vpe_get_csa_mc_addr(struct amdgpu_ring *ring, uint32_t vmid)
diff --git a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
index 952737de9411..1b7742728bf0 100644
--- a/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
+++ b/drivers/gpu/drm/amd/amdgpu/cik_sdma.c
@@ -198,14 +198,15 @@ static void cik_sdma_ring_set_wptr(struct amdgpu_ring *ring)
static void cik_sdma_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_NOP_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring, nop | SDMA_NOP_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
index 725392522267..4061f7a4d055 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v2_4.c
@@ -222,14 +222,16 @@ static void sdma_v2_4_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v2_4_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
index aa637541da58..f655b5c32bd8 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v3_0.c
@@ -398,14 +398,16 @@ static void sdma_v3_0_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v3_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
index 772604feb6ac..bc8bb6b184af 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_0.c
@@ -737,14 +737,16 @@ static void sdma_v4_0_page_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v4_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
index 01644a869738..3425cf35c0d6 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v4_4_2.c
@@ -290,14 +290,16 @@ static void sdma_v4_4_2_page_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v4_4_2_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
index b7d33d78bce0..fa6591e9769a 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_0.c
@@ -387,14 +387,16 @@ static void sdma_v5_0_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v5_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
index cc9e961f0078..f95e2831447e 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v5_2.c
@@ -194,14 +194,16 @@ static void sdma_v5_2_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v5_2_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
index dab4c2db8c9d..aabe4dc37876 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v6_0.c
@@ -181,14 +181,16 @@ static void sdma_v6_0_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v6_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/*
diff --git a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
index 96514fd77e35..37ca21dc9678 100644
--- a/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/sdma_v7_0.c
@@ -206,14 +206,16 @@ static void sdma_v7_0_ring_set_wptr(struct amdgpu_ring *ring)
static void sdma_v7_0_ring_insert_nop(struct amdgpu_ring *ring, uint32_t count)
{
struct amdgpu_sdma_instance *sdma = amdgpu_sdma_get_instance_from_ring(ring);
- int i;
+ u32 nop = ring->funcs->nop;
- for (i = 0; i < count; i++)
- if (sdma && sdma->burst_nop && (i == 0))
- amdgpu_ring_write(ring, ring->funcs->nop |
- SDMA_PKT_NOP_HEADER_COUNT(count - 1));
- else
- amdgpu_ring_write(ring, ring->funcs->nop);
+ if (!count)
+ return;
+
+ if (sdma && sdma->burst_nop)
+ amdgpu_ring_write(ring,
+ nop | SDMA_PKT_NOP_HEADER_COUNT(--count));
+
+ amdgpu_ring_fill(ring, nop, count);
}
/**
--
2.44.0
More information about the amd-gfx
mailing list