[PATCH] drm/amdgpu: optimize the padding with hw optimization

Thu Aug 1 06:53:23 UTC 2024

On Thu, Aug 1, 2024, 00:28 Khatri, Sunil <sukhatri at amd.com> wrote:

>
> On 8/1/2024 8:49 AM, Marek Olšák wrote:
> > On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri at amd.com>
> wrote:
> >> Adding NOP packets one by one in the ring
> >> does not use the CP efficiently.
> >>
> >> Solution:
> >> Use CP optimization while adding NOP packet's so PFP
> >> can discard NOP packets based on information of count
> >> from the Header instead of fetching all NOP packets
> >> one by one.
> >>
> >> Cc: Christian König <christian.koenig at amd.com>
> >> Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
> >> Cc: Tvrtko Ursulin <tursulin at igalia.com>
> >> Cc: Marek Olšák <marek.olsak at amd.com>
> >> Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
> >> ---
> >>   drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
> >>   1 file changed, 21 insertions(+), 3 deletions(-)
> >>
> >> diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> index 853084a2ce7f..edf5b5c4d185 100644
> >> --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> >> @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct
> amdgpu_ring *ring)
> >>          amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
> >>   }
> >>
> >> +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring,
> uint32_t num_nop)
> >> +{
> >> +       int i;
> >> +
> >> +       /* Header itself is a NOP packet */
> >> +       if (num_nop == 1) {
> >> +               amdgpu_ring_write(ring, ring->funcs->nop);
> >> +               return;
> >> +       }
> >> +
> >> +       /* Max HW optimization till 0x3ffe, followed by remaining one
> NOP at a time*/
> >> +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2,
> 0x3ffe)));
> >> +
> >> +       /* Header is at index 0, followed by num_nops - 1 NOP packet's
> */
> >> +       for (i = 1; i < num_nop; i++)
> >> +               amdgpu_ring_write(ring, ring->funcs->nop);
> > This loop should be removed. It's unnecessary CPU overhead and we
> > should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
> > whole packet body uninitialized is the fastest option.
> That was the original intent to just move the WPTR for the no of nops
> and tried too. Based on Christian inputs we should not let the nops packet
>
> as garbage or whatever was there originally as a threat/safety measure.

It doesn't help safety. It can only be read by the GPU with kernel-level
permissions.

Initializing the packet body is useless and adds CPU overhead, especially
with the 256 NOPs or so that we use for no reason.

Marek

> So CPU side there isnt any optimization but just CP will skip all these
> so GPU side should see the benefit.
>
> Regards
> Sunil Khatri
>
> >
> > Marek
> >
> >> +}
> >> +
> >>   static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
> >>   {
> >>          struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> >> @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_gfx = {
> >>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >>          .test_ring = gfx_v10_0_ring_test_ring,
> >>          .test_ib = gfx_v10_0_ring_test_ib,
> >> -       .insert_nop = amdgpu_ring_insert_nop,
> >> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>          .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
> >>          .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> >> @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_compute = {
> >>          .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >>          .test_ring = gfx_v10_0_ring_test_ring,
> >>          .test_ib = gfx_v10_0_ring_test_ib,
> >> -       .insert_nop = amdgpu_ring_insert_nop,
> >> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
> >>          .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> >> @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs
> gfx_v10_0_ring_funcs_kiq = {
> >>          .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
> >>          .test_ring = gfx_v10_0_ring_test_ring,
> >>          .test_ib = gfx_v10_0_ring_test_ib,
> >> -       .insert_nop = amdgpu_ring_insert_nop,
> >> +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >>          .pad_ib = amdgpu_ring_generic_pad_ib,
> >>          .emit_rreg = gfx_v10_0_ring_emit_rreg,
> >>          .emit_wreg = gfx_v10_0_ring_emit_wreg,
> >> --
> >> 2.34.1
> >>
>
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/amd-gfx/attachments/20240801/745f944c/attachment-0001.htm>