[Mesa-dev] [PATCH 1/6] radeonsi: emit_db_render_state packets optimization

Jiang, Sonny Sonny.Jiang at amd.com
Thu Jun 7 21:53:39 UTC 2018


By applied patch 1, gfx IB size changes obviously.


openarena GFX-IB-size changes from 7.64k to 7.44k.
glxgears GFX-IB-size changes from 8.48k to 8.416k.


The adding CPU workloads are quite small in this case and it's  hard to measure it. So the assumption is that writing a SET_CONTEXT_REG packet is more expensive. And we emit too many redundant packets. Some registers rarely get different values.


Thanks,

Sonny

________________________________
From: Dave Airlie <airlied at gmail.com>
Sent: Thursday, June 7, 2018 5:20:12 PM
To: Jiang, Sonny
Cc: mesa-dev
Subject: Re: [Mesa-dev] [PATCH 1/6] radeonsi: emit_db_render_state packets optimization

On 8 June 2018 at 02:13, Sonny Jiang <sonny.jiang at amd.com> wrote:
> Remembering latest states of registers to eliminate redunant SET_CONTEXT_REG packets
>

Does this help anywhere btw? Numbers for an app or game.

In the past I've always found this sort of optimisation pointless, the
CP on the GPU side
does a pretty good job at nuking pointless register sets or at least
hiding them.

This will increase CPU and memory usage in some ways in favour of
reducing CP and
possibly memory bandwidth, but without numbers I'm not sure it's a good plan.

Dave.

> Signed-off-by: Sonny Jiang <sonny.jiang at amd.com>
> ---
>  src/gallium/drivers/radeonsi/si_build_pm4.h | 43 +++++++++++++++++++++
>  src/gallium/drivers/radeonsi/si_gfx_cs.c    |  3 ++
>  src/gallium/drivers/radeonsi/si_pipe.h      |  2 +
>  src/gallium/drivers/radeonsi/si_state.c     | 60 +++++++++++++++--------------
>  src/gallium/drivers/radeonsi/si_state.h     | 16 ++++++++
>  5 files changed, 95 insertions(+), 29 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h
> index 22f5558..45d943f 100644
> --- a/src/gallium/drivers/radeonsi/si_build_pm4.h
> +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h
> @@ -110,4 +110,47 @@ static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs,
>         radeon_emit(cs, value);
>  }
>
> +/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */
> +static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,
> +                                             enum si_tracked_reg reg, unsigned value)
> +{
> +       struct radeon_winsys_cs *cs = sctx->gfx_cs;
> +
> +       if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||
> +           sctx->tracked_regs.reg_value[reg] != value ) {
> +
> +               radeon_set_context_reg(cs, offset, value);
> +
> +               sctx->tracked_regs.reg_saved |= 1 << reg;
> +               sctx->tracked_regs.reg_value[reg] = value;
> +       }
> +}
> +
> +/**
> + * Set 2 consecutive registers if any registers value is different.
> + * @param offset        starting register offset
> + * @param value1        is written to first register
> + * @param value2        is written to second register
> + */
> +static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,
> +                                              enum si_tracked_reg reg, unsigned value1,
> +                                              unsigned value2)
> +{
> +       struct radeon_winsys_cs *cs = sctx->gfx_cs;
> +
> +       if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||
> +           !(sctx->tracked_regs.reg_saved & (1 << (reg + 1))) ||
> +           sctx->tracked_regs.reg_value[reg] != value1 ||
> +           sctx->tracked_regs.reg_value[reg+1] != value2 ) {
> +
> +               radeon_set_context_reg_seq(cs, offset, 2);
> +               radeon_emit(cs, value1);
> +               radeon_emit(cs, value2);
> +
> +               sctx->tracked_regs.reg_value[reg] = value1;
> +               sctx->tracked_regs.reg_value[reg+1] = value2;
> +               sctx->tracked_regs.reg_saved |= 3 << reg;
> +       }
> +}
> +
>  #endif
> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> index ec74c1b..0b9a020 100644
> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
> @@ -321,4 +321,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx)
>         ctx->last_num_tcs_input_cp = -1;
>
>         ctx->cs_shader_state.initialized = false;
> +
> +       /* Set all saved registers state to unknown */
> +       ctx->tracked_regs.reg_saved = 0;
>  }
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index 5d1671f..82a8263 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -1033,6 +1033,8 @@ struct si_context {
>
>         void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst,
>                                  uint64_t offset, uint64_t size, unsigned value);
> +
> +       struct si_tracked_regs                  tracked_regs;
>  };
>
>  /* cik_sdma.c */
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index 3a7e928..c95b929 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -1343,28 +1343,25 @@ void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)
>
>  static void si_emit_db_render_state(struct si_context *sctx)
>  {
> -       struct radeon_winsys_cs *cs = sctx->gfx_cs;
>         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
> -       unsigned db_shader_control;
> -
> -       radeon_set_context_reg_seq(cs, R_028000_DB_RENDER_CONTROL, 2);
> +       unsigned db_shader_control, db_render_control, db_count_control;
>
>         /* DB_RENDER_CONTROL */
>         if (sctx->dbcb_depth_copy_enabled ||
>             sctx->dbcb_stencil_copy_enabled) {
> -               radeon_emit(cs,
> -                           S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
> -                           S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
> -                           S_028000_COPY_CENTROID(1) |
> -                           S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample));
> +               db_render_control =
> +                       S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |
> +                       S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |
> +                       S_028000_COPY_CENTROID(1) |
> +                       S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);
>         } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {
> -               radeon_emit(cs,
> -                           S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
> -                           S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace));
> +               db_render_control =
> +                       S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |
> +                       S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);
>         } else {
> -               radeon_emit(cs,
> -                           S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
> -                           S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear));
> +               db_render_control =
> +                       S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |
> +                       S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);
>         }
>
>         /* DB_COUNT_CONTROL (occlusion queries) */
> @@ -1373,28 +1370,33 @@ static void si_emit_db_render_state(struct si_context *sctx)
>                 bool perfect = sctx->num_perfect_occlusion_queries > 0;
>
>                 if (sctx->chip_class >= CIK) {
> -                       radeon_emit(cs,
> -                                   S_028004_PERFECT_ZPASS_COUNTS(perfect) |
> -                                   S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |
> -                                   S_028004_ZPASS_ENABLE(1) |
> -                                   S_028004_SLICE_EVEN_ENABLE(1) |
> -                                   S_028004_SLICE_ODD_ENABLE(1));
> +                       db_count_control =
> +                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |
> +                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |
> +                               S_028004_ZPASS_ENABLE(1) |
> +                               S_028004_SLICE_EVEN_ENABLE(1) |
> +                               S_028004_SLICE_ODD_ENABLE(1);
>                 } else {
> -                       radeon_emit(cs,
> -                                   S_028004_PERFECT_ZPASS_COUNTS(perfect) |
> -                                   S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples));
> +                       db_count_control =
> +                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |
> +                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);
>                 }
>         } else {
>                 /* Disable occlusion queries. */
>                 if (sctx->chip_class >= CIK) {
> -                       radeon_emit(cs, 0);
> +                       db_count_control = 0;
>                 } else {
> -                       radeon_emit(cs, S_028004_ZPASS_INCREMENT_DISABLE(1));
> +                       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);
>                 }
>         }
>
> +       radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,
> +                                   SI_TRACKED_DB_RENDER_CONTROL, db_render_control,
> +                                   db_count_control);
> +
>         /* DB_RENDER_OVERRIDE2 */
> -       radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2,
> +       radeon_opt_set_context_reg(sctx,  R_028010_DB_RENDER_OVERRIDE2,
> +               SI_TRACKED_DB_RENDER_OVERRIDE2,
>                 S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |
>                 S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |
>                 S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));
> @@ -1415,8 +1417,8 @@ static void si_emit_db_render_state(struct si_context *sctx)
>             !sctx->screen->rbplus_allowed)
>                 db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
>
> -       radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL,
> -                              db_shader_control);
> +       radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
> +                                  SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
>  }
>
>  /*
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index d235f31..fb5f721 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -206,6 +206,22 @@ struct si_shader_data {
>         uint32_t                sh_base[SI_NUM_SHADERS];
>  };
>
> +/* The list of registers whose emitted values are remembered by si_context. */
> +enum si_tracked_reg {
> +       SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */
> +       SI_TRACKED_DB_COUNT_CONTROL,
> +
> +       SI_TRACKED_DB_RENDER_OVERRIDE2,
> +       SI_TRACKED_DB_SHADER_CONTROL,
> +
> +       SI_NUM_TRACKED_REGS,
> +};
> +
> +struct si_tracked_regs {
> +       uint32_t                reg_saved;
> +       uint32_t                reg_value[SI_NUM_TRACKED_REGS];
> +};
> +
>  /* Private read-write buffer slots. */
>  enum {
>         SI_ES_RING_ESGS,
> --
> 2.7.4
>
> _______________________________________________
> mesa-dev mailing list
> mesa-dev at lists.freedesktop.org
> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
-------------- next part --------------
An HTML attachment was scrubbed...
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20180607/175a8999/attachment-0001.html>


More information about the mesa-dev mailing list