[PATCH] drm/amdgpu: optimize the padding with hw optimization

Thu Aug 1 03:22:43 UTC 2024

On Wed, Jul 31, 2024 at 11:19 PM Marek Olšák <maraeo at gmail.com> wrote:
>
> On Tue, Jul 30, 2024 at 8:43 AM Sunil Khatri <sunil.khatri at amd.com> wrote:
> >
> > Adding NOP packets one by one in the ring
> > does not use the CP efficiently.
> >
> > Solution:
> > Use CP optimization while adding NOP packets so PFP
> > can discard NOP packets based on information of count
> > from the Header instead of fetching all NOP packets
> > one by one.
> >
> > Cc: Christian König <christian.koenig at amd.com>
> > Cc: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
> > Cc: Tvrtko Ursulin <tursulin at igalia.com>
> > Cc: Marek Olšák <marek.olsak at amd.com>
> > Signed-off-by: Sunil Khatri <sunil.khatri at amd.com>
> > ---
> >  drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c | 24 +++++++++++++++++++++---
> >  1 file changed, 21 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > index 853084a2ce7f..edf5b5c4d185 100644
> > --- a/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > +++ b/drivers/gpu/drm/amd/amdgpu/gfx_v10_0.c
> > @@ -9397,6 +9397,24 @@ static void gfx_v10_0_emit_mem_sync(struct amdgpu_ring *ring)
> >         amdgpu_ring_write(ring, gcr_cntl); /* GCR_CNTL */
> >  }
> >
> > +static void amdgpu_gfx10_ring_insert_nop(struct amdgpu_ring *ring, uint32_t num_nop)
> > +{
> > +       int i;
> > +
> > +       /* Header itself is a NOP packet */
> > +       if (num_nop == 1) {
> > +               amdgpu_ring_write(ring, ring->funcs->nop);
> > +               return;
> > +       }
> > +
> > +       /* Max HW optimization till 0x3ffe, followed by remaining one NOP at a time */
> > +       amdgpu_ring_write(ring, PACKET3(PACKET3_NOP, min(num_nop - 2, 0x3ffe)));
> > +
> > +       /* Header is at index 0, followed by num_nop - 1 NOP packets */
> > +       for (i = 1; i < num_nop; i++)
> > +               amdgpu_ring_write(ring, ring->funcs->nop);
>
> This loop should be removed. It's unnecessary CPU overhead and we
> should never get more than 0x3fff NOPs (maybe use BUG_ON). Leaving the
> whole packet body uninitialized is the fastest option.

If you remove amdgpu_ring_write, you still need to move wptr somehow.
amdgpu_ring_write_multiple gives a hint about how to do it:

ring->wptr += count_dw;
ring->wptr &= ring->ptr_mask;
ring->count_dw -= count_dw;

Marek

>
> Marek
>
> > +}
> > +
> >  static void gfx_v10_ip_print(void *handle, struct drm_printer *p)
> >  {
> >         struct amdgpu_device *adev = (struct amdgpu_device *)handle;
> > @@ -9588,7 +9606,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_gfx = {
> >         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >         .test_ring = gfx_v10_0_ring_test_ring,
> >         .test_ib = gfx_v10_0_ring_test_ib,
> > -       .insert_nop = amdgpu_ring_insert_nop,
> > +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_switch_buffer = gfx_v10_0_ring_emit_sb,
> >         .emit_cntxcntl = gfx_v10_0_ring_emit_cntxcntl,
> > @@ -9629,7 +9647,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_compute = {
> >         .emit_hdp_flush = gfx_v10_0_ring_emit_hdp_flush,
> >         .test_ring = gfx_v10_0_ring_test_ring,
> >         .test_ib = gfx_v10_0_ring_test_ib,
> > -       .insert_nop = amdgpu_ring_insert_nop,
> > +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_wreg = gfx_v10_0_ring_emit_wreg,
> >         .emit_reg_wait = gfx_v10_0_ring_emit_reg_wait,
> > @@ -9659,7 +9677,7 @@ static const struct amdgpu_ring_funcs gfx_v10_0_ring_funcs_kiq = {
> >         .emit_fence = gfx_v10_0_ring_emit_fence_kiq,
> >         .test_ring = gfx_v10_0_ring_test_ring,
> >         .test_ib = gfx_v10_0_ring_test_ib,
> > -       .insert_nop = amdgpu_ring_insert_nop,
> > +       .insert_nop = amdgpu_gfx10_ring_insert_nop,
> >         .pad_ib = amdgpu_ring_generic_pad_ib,
> >         .emit_rreg = gfx_v10_0_ring_emit_rreg,
> >         .emit_wreg = gfx_v10_0_ring_emit_wreg,
> > --
> > 2.34.1
> >