<html>
<head>
<meta http-equiv="Content-Type" content="text/html; charset=us-ascii">
<style type="text/css" style="display:none;"><!-- P {margin-top:0;margin-bottom:0;} --></style>
</head>
<body dir="ltr">
<div id="divtagdefaultwrapper" style="font-size:12pt;color:#000000;font-family:Calibri,Helvetica,sans-serif;" dir="ltr">
<p style="margin-top:0;margin-bottom:0">By applied patch 1, gfx IB size changes obviously.</p>
<p style="margin-top:0;margin-bottom:0"><br>
</p>
<p style="margin-top:0;margin-bottom:0"></p>
<div>openarena GFX-IB-size changes from 7.64k to 7.44k.<br>
glxgears GFX-IB-size changes from 8.48k to 8.416k.</div>
<p></p>
<p style="margin-top:0;margin-bottom:0"><br>
</p>
<p style="margin-top:0;margin-bottom:0">The adding CPU workloads are quite small in this case and it's  hard to measure it. So t<span>he assumption is that writing a SET_CONTEXT_REG packet is more expensive. And we emit too many redundant packets. Some registers
 rarely get different values.</span></p>
<p style="margin-top:0;margin-bottom:0"><br>
</p>
<p style="margin-top:0;margin-bottom:0">Thanks,</p>
<p style="margin-top:0;margin-bottom:0">Sonny<br>
</p>
</div>
<hr style="display:inline-block;width:98%" tabindex="-1">
<div id="divRplyFwdMsg" dir="ltr"><font face="Calibri, sans-serif" style="font-size:11pt" color="#000000"><b>From:</b> Dave Airlie <airlied@gmail.com><br>
<b>Sent:</b> Thursday, June 7, 2018 5:20:12 PM<br>
<b>To:</b> Jiang, Sonny<br>
<b>Cc:</b> mesa-dev<br>
<b>Subject:</b> Re: [Mesa-dev] [PATCH 1/6] radeonsi: emit_db_render_state packets optimization</font>
<div> </div>
</div>
<div class="BodyFragment"><font size="2"><span style="font-size:11pt;">
<div class="PlainText">On 8 June 2018 at 02:13, Sonny Jiang <sonny.jiang@amd.com> wrote:<br>
> Remembering latest states of registers to eliminate redunant SET_CONTEXT_REG packets<br>
><br>
<br>
Does this help anywhere btw? Numbers for an app or game.<br>
<br>
In the past I've always found this sort of optimisation pointless, the<br>
CP on the GPU side<br>
does a pretty good job at nuking pointless register sets or at least<br>
hiding them.<br>
<br>
This will increase CPU and memory usage in some ways in favour of<br>
reducing CP and<br>
possibly memory bandwidth, but without numbers I'm not sure it's a good plan.<br>
<br>
Dave.<br>
<br>
> Signed-off-by: Sonny Jiang <sonny.jiang@amd.com><br>
> ---<br>
>  src/gallium/drivers/radeonsi/si_build_pm4.h | 43 +++++++++++++++++++++<br>
>  src/gallium/drivers/radeonsi/si_gfx_cs.c    |  3 ++<br>
>  src/gallium/drivers/radeonsi/si_pipe.h      |  2 +<br>
>  src/gallium/drivers/radeonsi/si_state.c     | 60 +++++++++++++++--------------<br>
>  src/gallium/drivers/radeonsi/si_state.h     | 16 ++++++++<br>
>  5 files changed, 95 insertions(+), 29 deletions(-)<br>
><br>
> diff --git a/src/gallium/drivers/radeonsi/si_build_pm4.h b/src/gallium/drivers/radeonsi/si_build_pm4.h<br>
> index 22f5558..45d943f 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_build_pm4.h<br>
> +++ b/src/gallium/drivers/radeonsi/si_build_pm4.h<br>
> @@ -110,4 +110,47 @@ static inline void radeon_set_uconfig_reg_idx(struct radeon_winsys_cs *cs,<br>
>         radeon_emit(cs, value);<br>
>  }<br>
><br>
> +/* Emit PKT3_SET_CONTEXT_REG if the register value is different. */<br>
> +static inline void radeon_opt_set_context_reg(struct si_context *sctx, unsigned offset,<br>
> +                                             enum si_tracked_reg reg, unsigned value)<br>
> +{<br>
> +       struct radeon_winsys_cs *cs = sctx->gfx_cs;<br>
> +<br>
> +       if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||<br>
> +           sctx->tracked_regs.reg_value[reg] != value ) {<br>
> +<br>
> +               radeon_set_context_reg(cs, offset, value);<br>
> +<br>
> +               sctx->tracked_regs.reg_saved |= 1 << reg;<br>
> +               sctx->tracked_regs.reg_value[reg] = value;<br>
> +       }<br>
> +}<br>
> +<br>
> +/**<br>
> + * Set 2 consecutive registers if any registers value is different.<br>
> + * @param offset        starting register offset<br>
> + * @param value1        is written to first register<br>
> + * @param value2        is written to second register<br>
> + */<br>
> +static inline void radeon_opt_set_context_reg2(struct si_context *sctx, unsigned offset,<br>
> +                                              enum si_tracked_reg reg, unsigned value1,<br>
> +                                              unsigned value2)<br>
> +{<br>
> +       struct radeon_winsys_cs *cs = sctx->gfx_cs;<br>
> +<br>
> +       if (!(sctx->tracked_regs.reg_saved & (1 << reg)) ||<br>
> +           !(sctx->tracked_regs.reg_saved & (1 << (reg + 1))) ||<br>
> +           sctx->tracked_regs.reg_value[reg] != value1 ||<br>
> +           sctx->tracked_regs.reg_value[reg+1] != value2 ) {<br>
> +<br>
> +               radeon_set_context_reg_seq(cs, offset, 2);<br>
> +               radeon_emit(cs, value1);<br>
> +               radeon_emit(cs, value2);<br>
> +<br>
> +               sctx->tracked_regs.reg_value[reg] = value1;<br>
> +               sctx->tracked_regs.reg_value[reg+1] = value2;<br>
> +               sctx->tracked_regs.reg_saved |= 3 << reg;<br>
> +       }<br>
> +}<br>
> +<br>
>  #endif<br>
> diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c<br>
> index ec74c1b..0b9a020 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_gfx_cs.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c<br>
> @@ -321,4 +321,7 @@ void si_begin_new_gfx_cs(struct si_context *ctx)<br>
>         ctx->last_num_tcs_input_cp = -1;<br>
><br>
>         ctx->cs_shader_state.initialized = false;<br>
> +<br>
> +       /* Set all saved registers state to unknown */<br>
> +       ctx->tracked_regs.reg_saved = 0;<br>
>  }<br>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> index 5d1671f..82a8263 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_pipe.h<br>
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> @@ -1033,6 +1033,8 @@ struct si_context {<br>
><br>
>         void (*dma_clear_buffer)(struct si_context *sctx, struct pipe_resource *dst,<br>
>                                  uint64_t offset, uint64_t size, unsigned value);<br>
> +<br>
> +       struct si_tracked_regs                  tracked_regs;<br>
>  };<br>
><br>
>  /* cik_sdma.c */<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c<br>
> index 3a7e928..c95b929 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state.c<br>
> @@ -1343,28 +1343,25 @@ void si_save_qbo_state(struct si_context *sctx, struct si_qbo_state *st)<br>
><br>
>  static void si_emit_db_render_state(struct si_context *sctx)<br>
>  {<br>
> -       struct radeon_winsys_cs *cs = sctx->gfx_cs;<br>
>         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;<br>
> -       unsigned db_shader_control;<br>
> -<br>
> -       radeon_set_context_reg_seq(cs, R_028000_DB_RENDER_CONTROL, 2);<br>
> +       unsigned db_shader_control, db_render_control, db_count_control;<br>
><br>
>         /* DB_RENDER_CONTROL */<br>
>         if (sctx->dbcb_depth_copy_enabled ||<br>
>             sctx->dbcb_stencil_copy_enabled) {<br>
> -               radeon_emit(cs,<br>
> -                           S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |<br>
> -                           S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |<br>
> -                           S_028000_COPY_CENTROID(1) |<br>
> -                           S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample));<br>
> +               db_render_control =<br>
> +                       S_028000_DEPTH_COPY(sctx->dbcb_depth_copy_enabled) |<br>
> +                       S_028000_STENCIL_COPY(sctx->dbcb_stencil_copy_enabled) |<br>
> +                       S_028000_COPY_CENTROID(1) |<br>
> +                       S_028000_COPY_SAMPLE(sctx->dbcb_copy_sample);<br>
>         } else if (sctx->db_flush_depth_inplace || sctx->db_flush_stencil_inplace) {<br>
> -               radeon_emit(cs,<br>
> -                           S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |<br>
> -                           S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace));<br>
> +               db_render_control =<br>
> +                       S_028000_DEPTH_COMPRESS_DISABLE(sctx->db_flush_depth_inplace) |<br>
> +                       S_028000_STENCIL_COMPRESS_DISABLE(sctx->db_flush_stencil_inplace);<br>
>         } else {<br>
> -               radeon_emit(cs,<br>
> -                           S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |<br>
> -                           S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear));<br>
> +               db_render_control =<br>
> +                       S_028000_DEPTH_CLEAR_ENABLE(sctx->db_depth_clear) |<br>
> +                       S_028000_STENCIL_CLEAR_ENABLE(sctx->db_stencil_clear);<br>
>         }<br>
><br>
>         /* DB_COUNT_CONTROL (occlusion queries) */<br>
> @@ -1373,28 +1370,33 @@ static void si_emit_db_render_state(struct si_context *sctx)<br>
>                 bool perfect = sctx->num_perfect_occlusion_queries > 0;<br>
><br>
>                 if (sctx->chip_class >= CIK) {<br>
> -                       radeon_emit(cs,<br>
> -                                   S_028004_PERFECT_ZPASS_COUNTS(perfect) |<br>
> -                                   S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |<br>
> -                                   S_028004_ZPASS_ENABLE(1) |<br>
> -                                   S_028004_SLICE_EVEN_ENABLE(1) |<br>
> -                                   S_028004_SLICE_ODD_ENABLE(1));<br>
> +                       db_count_control =<br>
> +                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |<br>
> +                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples) |<br>
> +                               S_028004_ZPASS_ENABLE(1) |<br>
> +                               S_028004_SLICE_EVEN_ENABLE(1) |<br>
> +                               S_028004_SLICE_ODD_ENABLE(1);<br>
>                 } else {<br>
> -                       radeon_emit(cs,<br>
> -                                   S_028004_PERFECT_ZPASS_COUNTS(perfect) |<br>
> -                                   S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples));<br>
> +                       db_count_control =<br>
> +                               S_028004_PERFECT_ZPASS_COUNTS(perfect) |<br>
> +                               S_028004_SAMPLE_RATE(sctx->framebuffer.log_samples);<br>
>                 }<br>
>         } else {<br>
>                 /* Disable occlusion queries. */<br>
>                 if (sctx->chip_class >= CIK) {<br>
> -                       radeon_emit(cs, 0);<br>
> +                       db_count_control = 0;<br>
>                 } else {<br>
> -                       radeon_emit(cs, S_028004_ZPASS_INCREMENT_DISABLE(1));<br>
> +                       db_count_control = S_028004_ZPASS_INCREMENT_DISABLE(1);<br>
>                 }<br>
>         }<br>
><br>
> +       radeon_opt_set_context_reg2(sctx, R_028000_DB_RENDER_CONTROL,<br>
> +                                   SI_TRACKED_DB_RENDER_CONTROL, db_render_control,<br>
> +                                   db_count_control);<br>
> +<br>
>         /* DB_RENDER_OVERRIDE2 */<br>
> -       radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2,<br>
> +       radeon_opt_set_context_reg(sctx,  R_028010_DB_RENDER_OVERRIDE2,<br>
> +               SI_TRACKED_DB_RENDER_OVERRIDE2,<br>
>                 S_028010_DISABLE_ZMASK_EXPCLEAR_OPTIMIZATION(sctx->db_depth_disable_expclear) |<br>
>                 S_028010_DISABLE_SMEM_EXPCLEAR_OPTIMIZATION(sctx->db_stencil_disable_expclear) |<br>
>                 S_028010_DECOMPRESS_Z_ON_FLUSH(sctx->framebuffer.nr_samples >= 4));<br>
> @@ -1415,8 +1417,8 @@ static void si_emit_db_render_state(struct si_context *sctx)<br>
>             !sctx->screen->rbplus_allowed)<br>
>                 db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);<br>
><br>
> -       radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL,<br>
> -                              db_shader_control);<br>
> +       radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,<br>
> +                                  SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);<br>
>  }<br>
><br>
>  /*<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h<br>
> index d235f31..fb5f721 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state.h<br>
> +++ b/src/gallium/drivers/radeonsi/si_state.h<br>
> @@ -206,6 +206,22 @@ struct si_shader_data {<br>
>         uint32_t                sh_base[SI_NUM_SHADERS];<br>
>  };<br>
><br>
> +/* The list of registers whose emitted values are remembered by si_context. */<br>
> +enum si_tracked_reg {<br>
> +       SI_TRACKED_DB_RENDER_CONTROL, /* 2 consecutive registers */<br>
> +       SI_TRACKED_DB_COUNT_CONTROL,<br>
> +<br>
> +       SI_TRACKED_DB_RENDER_OVERRIDE2,<br>
> +       SI_TRACKED_DB_SHADER_CONTROL,<br>
> +<br>
> +       SI_NUM_TRACKED_REGS,<br>
> +};<br>
> +<br>
> +struct si_tracked_regs {<br>
> +       uint32_t                reg_saved;<br>
> +       uint32_t                reg_value[SI_NUM_TRACKED_REGS];<br>
> +};<br>
> +<br>
>  /* Private read-write buffer slots. */<br>
>  enum {<br>
>         SI_ES_RING_ESGS,<br>
> --<br>
> 2.7.4<br>
><br>
> _______________________________________________<br>
> mesa-dev mailing list<br>
> mesa-dev@lists.freedesktop.org<br>
> <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</div>
</span></font></div>
</body>
</html>