[Mesa-dev] [PATCH] radv: avoid context rolls when binding graphics pipelines

Bas Nieuwenhuizen bas at basnieuwenhuizen.nl
Wed Jan 16 00:24:10 UTC 2019


On Mon, Jan 14, 2019 at 5:12 PM Rhys Perry <pendingchaos02 at gmail.com> wrote:
>
> I did and found small improvements in Rise of the Tomb Raider. I
> measured framerates ~104.3% that of without the changes for the
> Geothermal Valley scene, ~101.2% for Spine of the Mountain and ~102.3%
> for Prophets Tomb.

My main question would be what the statistical significance is — e.g.
did you do one run of each, did you do multiple, and what was your
test setup?

Just curious because I have tried the exact same thing before and
could not find anything more than noise.

>
> I found no change with Dota 2 but I've heard it's cpu-bound.
>
> On Mon, 14 Jan 2019 at 16:05, Samuel Pitoiset <samuel.pitoiset at gmail.com> wrote:
> >
> > Did you benchmark?
> >
> > On 1/14/19 5:01 PM, Rhys Perry wrote:
> > > It's common in some applications to bind a new graphics pipeline without
> > > ending up changing any context registers.
> > >
> > > This makes a pipeline have two command buffers: one for setting context
> > > registers and one for everything else. The context register command buffer
> > > is only emitted if it differs from the previous pipeline's.
> > >
> > > Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
> > > ---
> > >   src/amd/vulkan/radv_cmd_buffer.c |  46 +++++--
> > >   src/amd/vulkan/radv_pipeline.c   | 217 ++++++++++++++++---------------
> > >   src/amd/vulkan/radv_private.h    |   2 +
> > >   3 files changed, 150 insertions(+), 115 deletions(-)
> > >
> > > diff --git a/src/amd/vulkan/radv_cmd_buffer.c b/src/amd/vulkan/radv_cmd_buffer.c
> > > index f41d6c0b3e7..59903ab64d8 100644
> > > --- a/src/amd/vulkan/radv_cmd_buffer.c
> > > +++ b/src/amd/vulkan/radv_cmd_buffer.c
> > > @@ -634,7 +634,7 @@ radv_emit_descriptor_pointers(struct radv_cmd_buffer *cmd_buffer,
> > >       }
> > >   }
> > >
> > > -static void
> > > +static bool
> > >   radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > >                             struct radv_pipeline *pipeline)
> > >   {
> > > @@ -646,7 +646,7 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > >               cmd_buffer->sample_positions_needed = true;
> > >
> > >       if (old_pipeline && num_samples == old_pipeline->graphics.ms.num_samples)
> > > -             return;
> > > +             return false;
> > >
> > >       radeon_set_context_reg_seq(cmd_buffer->cs, R_028BDC_PA_SC_LINE_CNTL, 2);
> > >       radeon_emit(cmd_buffer->cs, ms->pa_sc_line_cntl);
> > > @@ -661,6 +661,8 @@ radv_update_multisample_state(struct radv_cmd_buffer *cmd_buffer,
> > >               radeon_emit(cmd_buffer->cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
> > >               radeon_emit(cmd_buffer->cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
> > >       }
> > > +
> > > +     return true;
> > >   }
> > >
> > >   static void
> > > @@ -863,15 +865,15 @@ radv_emit_rbplus_state(struct radv_cmd_buffer *cmd_buffer)
> > >       radeon_emit(cmd_buffer->cs, sx_blend_opt_control);
> > >   }
> > >
> > > -static void
> > > +static bool
> > >   radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > >   {
> > >       struct radv_pipeline *pipeline = cmd_buffer->state.pipeline;
> > >
> > >       if (!pipeline || cmd_buffer->state.emitted_pipeline == pipeline)
> > > -             return;
> > > +             return false;
> > >
> > > -     radv_update_multisample_state(cmd_buffer, pipeline);
> > > +     bool context_roll = radv_update_multisample_state(cmd_buffer, pipeline);
> > >
> > >       cmd_buffer->scratch_size_needed =
> > >                                 MAX2(cmd_buffer->scratch_size_needed,
> > > @@ -884,6 +886,15 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > >
> > >       radeon_emit_array(cmd_buffer->cs, pipeline->cs.buf, pipeline->cs.cdw);
> > >
> > > +     if (!cmd_buffer->state.emitted_pipeline ||
> > > +         cmd_buffer->state.emitted_pipeline->ctx_cs.cdw != pipeline->ctx_cs.cdw ||
> > > +         cmd_buffer->state.emitted_pipeline->ctx_cs_hash != pipeline->ctx_cs_hash ||
> > > +         memcmp(cmd_buffer->state.emitted_pipeline->ctx_cs.buf,
> > > +                pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw * 4)) {
> > > +             radeon_emit_array(cmd_buffer->cs, pipeline->ctx_cs.buf, pipeline->ctx_cs.cdw);
> > > +             context_roll = true;
> > > +     }
> > > +
> > >       for (unsigned i = 0; i < MESA_SHADER_COMPUTE; i++) {
> > >               if (!pipeline->shaders[i])
> > >                       continue;
> > > @@ -902,6 +913,8 @@ radv_emit_graphics_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > >       cmd_buffer->state.emitted_pipeline = pipeline;
> > >
> > >       cmd_buffer->state.dirty &= ~RADV_CMD_DIRTY_PIPELINE;
> > > +
> > > +     return context_roll;
> > >   }
> > >
> > >   static void
> > > @@ -2859,6 +2872,8 @@ radv_emit_compute_pipeline(struct radv_cmd_buffer *cmd_buffer)
> > >       if (!pipeline || pipeline == cmd_buffer->state.emitted_compute_pipeline)
> > >               return;
> > >
> > > +     assert(!pipeline->ctx_cs.cdw);
> > > +
> > >       cmd_buffer->state.emitted_compute_pipeline = pipeline;
> > >
> > >       radeon_check_space(cmd_buffer->device->ws, cmd_buffer->cs, pipeline->cs.cdw);
> > > @@ -3609,30 +3624,30 @@ radv_emit_draw_packets(struct radv_cmd_buffer *cmd_buffer,
> > >    * any context registers.
> > >    */
> > >   static bool radv_need_late_scissor_emission(struct radv_cmd_buffer *cmd_buffer,
> > > -                                            bool indexed_draw)
> > > +                                            bool indexed_draw,
> > > +                                            bool pipeline_context_roll)
> > >   {
> > >       struct radv_cmd_state *state = &cmd_buffer->state;
> > >
> > >       if (!cmd_buffer->device->physical_device->has_scissor_bug)
> > >               return false;
> > >
> > > +     if (pipeline_context_roll)
> > > +             return true;
> > > +
> > >       uint32_t used_states = cmd_buffer->state.pipeline->graphics.needed_dynamic_state | ~RADV_CMD_DIRTY_DYNAMIC_ALL;
> > >
> > >       /* Index, vertex and streamout buffers don't change context regs, and
> > > -      * pipeline is handled later.
> > > +      * pipeline is already handled.
> > >        */
> > >       used_states &= ~(RADV_CMD_DIRTY_INDEX_BUFFER |
> > >                        RADV_CMD_DIRTY_VERTEX_BUFFER |
> > >                        RADV_CMD_DIRTY_STREAMOUT_BUFFER |
> > >                        RADV_CMD_DIRTY_PIPELINE);
> > >
> > > -     /* Assume all state changes except  these two can imply context rolls. */
> > >       if (cmd_buffer->state.dirty & used_states)
> > >               return true;
> > >
> > > -     if (cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
> > > -             return true;
> > > -
> > >       if (indexed_draw && state->pipeline->graphics.prim_restart_enable &&
> > >           (state->index_type ? 0xffffffffu : 0xffffu) != state->last_primitive_reset_index)
> > >               return true;
> > > @@ -3644,14 +3659,19 @@ static void
> > >   radv_emit_all_graphics_states(struct radv_cmd_buffer *cmd_buffer,
> > >                             const struct radv_draw_info *info)
> > >   {
> > > -     bool late_scissor_emission = radv_need_late_scissor_emission(cmd_buffer, info->indexed);
> > > +     bool late_scissor_emission;
> > > +     bool pipeline_context_roll = false;
> > >
> > >       if ((cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER) ||
> > >           cmd_buffer->state.emitted_pipeline != cmd_buffer->state.pipeline)
> > >               radv_emit_rbplus_state(cmd_buffer);
> > >
> > >       if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_PIPELINE)
> > > -             radv_emit_graphics_pipeline(cmd_buffer);
> > > +             pipeline_context_roll = radv_emit_graphics_pipeline(cmd_buffer);
> > > +
> > > +     late_scissor_emission =
> > > +             radv_need_late_scissor_emission(cmd_buffer, info->indexed,
> > > +                                             pipeline_context_roll);
> > >
> > >       if (cmd_buffer->state.dirty & RADV_CMD_DIRTY_FRAMEBUFFER)
> > >               radv_emit_framebuffer_state(cmd_buffer);
> > > diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
> > > index 9d5da43532f..fba77584013 100644
> > > --- a/src/amd/vulkan/radv_pipeline.c
> > > +++ b/src/amd/vulkan/radv_pipeline.c
> > > @@ -2525,7 +2525,7 @@ radv_compute_bin_size(struct radv_pipeline *pipeline, const VkGraphicsPipelineCr
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_binning_state(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_binning_state(struct radeon_cmdbuf *ctx_cs,
> > >                                    struct radv_pipeline *pipeline,
> > >                                    const VkGraphicsPipelineCreateInfo *pCreateInfo)
> > >   {
> > > @@ -2575,15 +2575,15 @@ radv_pipeline_generate_binning_state(struct radeon_cmdbuf *cs,
> > >                       S_028C44_OPTIMAL_BIN_SELECTION(1);
> > >       }
> > >
> > > -     radeon_set_context_reg(cs, R_028C44_PA_SC_BINNER_CNTL_0,
> > > +     radeon_set_context_reg(ctx_cs, R_028C44_PA_SC_BINNER_CNTL_0,
> > >                              pa_sc_binner_cntl_0);
> > > -     radeon_set_context_reg(cs, R_028060_DB_DFSM_CONTROL,
> > > +     radeon_set_context_reg(ctx_cs, R_028060_DB_DFSM_CONTROL,
> > >                              db_dfsm_control);
> > >   }
> > >
> > >
> > >   static void
> > > -radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *ctx_cs,
> > >                                              struct radv_pipeline *pipeline,
> > >                                              const VkGraphicsPipelineCreateInfo *pCreateInfo,
> > >                                              const struct radv_graphics_pipeline_create_info *extra)
> > > @@ -2656,35 +2656,35 @@ radv_pipeline_generate_depth_stencil_state(struct radeon_cmdbuf *cs,
> > >               db_render_override |= S_02800C_DISABLE_VIEWPORT_CLAMP(1);
> > >       }
> > >
> > > -     radeon_set_context_reg(cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
> > > -     radeon_set_context_reg(cs, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
> > > +     radeon_set_context_reg(ctx_cs, R_028800_DB_DEPTH_CONTROL, db_depth_control);
> > > +     radeon_set_context_reg(ctx_cs, R_02842C_DB_STENCIL_CONTROL, db_stencil_control);
> > >
> > > -     radeon_set_context_reg(cs, R_028000_DB_RENDER_CONTROL, db_render_control);
> > > -     radeon_set_context_reg(cs, R_02800C_DB_RENDER_OVERRIDE, db_render_override);
> > > -     radeon_set_context_reg(cs, R_028010_DB_RENDER_OVERRIDE2, db_render_override2);
> > > +     radeon_set_context_reg(ctx_cs, R_028000_DB_RENDER_CONTROL, db_render_control);
> > > +     radeon_set_context_reg(ctx_cs, R_02800C_DB_RENDER_OVERRIDE, db_render_override);
> > > +     radeon_set_context_reg(ctx_cs, R_028010_DB_RENDER_OVERRIDE2, db_render_override2);
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_blend_state(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_blend_state(struct radeon_cmdbuf *ctx_cs,
> > >                                      struct radv_pipeline *pipeline,
> > >                                      const struct radv_blend_state *blend)
> > >   {
> > > -     radeon_set_context_reg_seq(cs, R_028780_CB_BLEND0_CONTROL, 8);
> > > -     radeon_emit_array(cs, blend->cb_blend_control,
> > > +     radeon_set_context_reg_seq(ctx_cs, R_028780_CB_BLEND0_CONTROL, 8);
> > > +     radeon_emit_array(ctx_cs, blend->cb_blend_control,
> > >                         8);
> > > -     radeon_set_context_reg(cs, R_028808_CB_COLOR_CONTROL, blend->cb_color_control);
> > > -     radeon_set_context_reg(cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
> > > +     radeon_set_context_reg(ctx_cs, R_028808_CB_COLOR_CONTROL, blend->cb_color_control);
> > > +     radeon_set_context_reg(ctx_cs, R_028B70_DB_ALPHA_TO_MASK, blend->db_alpha_to_mask);
> > >
> > >       if (pipeline->device->physical_device->has_rbplus) {
> > >
> > > -             radeon_set_context_reg_seq(cs, R_028760_SX_MRT0_BLEND_OPT, 8);
> > > -             radeon_emit_array(cs, blend->sx_mrt_blend_opt, 8);
> > > +             radeon_set_context_reg_seq(ctx_cs, R_028760_SX_MRT0_BLEND_OPT, 8);
> > > +             radeon_emit_array(ctx_cs, blend->sx_mrt_blend_opt, 8);
> > >       }
> > >
> > > -     radeon_set_context_reg(cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
> > > +     radeon_set_context_reg(ctx_cs, R_028714_SPI_SHADER_COL_FORMAT, blend->spi_shader_col_format);
> > >
> > > -     radeon_set_context_reg(cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask);
> > > -     radeon_set_context_reg(cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
> > > +     radeon_set_context_reg(ctx_cs, R_028238_CB_TARGET_MASK, blend->cb_target_mask);
> > > +     radeon_set_context_reg(ctx_cs, R_02823C_CB_SHADER_MASK, blend->cb_shader_mask);
> > >
> > >       pipeline->graphics.col_format = blend->spi_shader_col_format;
> > >       pipeline->graphics.cb_target_mask = blend->cb_target_mask;
> > > @@ -2702,7 +2702,7 @@ radv_get_conservative_raster_mode(const VkPipelineRasterizationStateCreateInfo *
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_raster_state(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_raster_state(struct radeon_cmdbuf *ctx_cs,
> > >                                   struct radv_pipeline *pipeline,
> > >                                       const VkGraphicsPipelineCreateInfo *pCreateInfo)
> > >   {
> > > @@ -2711,14 +2711,14 @@ radv_pipeline_generate_raster_state(struct radeon_cmdbuf *cs,
> > >               radv_get_conservative_raster_mode(vkraster);
> > >       uint32_t pa_sc_conservative_rast = S_028C4C_NULL_SQUAD_AA_MASK_ENABLE(1);
> > >
> > > -     radeon_set_context_reg(cs, R_028810_PA_CL_CLIP_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_028810_PA_CL_CLIP_CNTL,
> > >                              S_028810_DX_CLIP_SPACE_DEF(1) | // vulkan uses DX conventions.
> > >                              S_028810_ZCLIP_NEAR_DISABLE(vkraster->depthClampEnable ? 1 : 0) |
> > >                              S_028810_ZCLIP_FAR_DISABLE(vkraster->depthClampEnable ? 1 : 0) |
> > >                              S_028810_DX_RASTERIZATION_KILL(vkraster->rasterizerDiscardEnable ? 1 : 0) |
> > >                              S_028810_DX_LINEAR_ATTR_CLIP_ENA(1));
> > >
> > > -     radeon_set_context_reg(cs, R_0286D4_SPI_INTERP_CONTROL_0,
> > > +     radeon_set_context_reg(ctx_cs, R_0286D4_SPI_INTERP_CONTROL_0,
> > >                              S_0286D4_FLAT_SHADE_ENA(1) |
> > >                              S_0286D4_PNT_SPRITE_ENA(1) |
> > >                              S_0286D4_PNT_SPRITE_OVRD_X(V_0286D4_SPI_PNT_SPRITE_SEL_S) |
> > > @@ -2727,12 +2727,12 @@ radv_pipeline_generate_raster_state(struct radeon_cmdbuf *cs,
> > >                              S_0286D4_PNT_SPRITE_OVRD_W(V_0286D4_SPI_PNT_SPRITE_SEL_1) |
> > >                              S_0286D4_PNT_SPRITE_TOP_1(0)); /* vulkan is top to bottom - 1.0 at bottom */
> > >
> > > -     radeon_set_context_reg(cs, R_028BE4_PA_SU_VTX_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_028BE4_PA_SU_VTX_CNTL,
> > >                              S_028BE4_PIX_CENTER(1) | // TODO verify
> > >                              S_028BE4_ROUND_MODE(V_028BE4_X_ROUND_TO_EVEN) |
> > >                              S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH));
> > >
> > > -     radeon_set_context_reg(cs, R_028814_PA_SU_SC_MODE_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_028814_PA_SU_SC_MODE_CNTL,
> > >                              S_028814_FACE(vkraster->frontFace) |
> > >                              S_028814_CULL_FRONT(!!(vkraster->cullMode & VK_CULL_MODE_FRONT_BIT)) |
> > >                              S_028814_CULL_BACK(!!(vkraster->cullMode & VK_CULL_MODE_BACK_BIT)) |
> > > @@ -2773,37 +2773,37 @@ radv_pipeline_generate_raster_state(struct radeon_cmdbuf *cs,
> > >               }
> > >       }
> > >
> > > -     radeon_set_context_reg(cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_028C4C_PA_SC_CONSERVATIVE_RASTERIZATION_CNTL,
> > >                                  pa_sc_conservative_rast);
> > >   }
> > >
> > >
> > >   static void
> > > -radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_multisample_state(struct radeon_cmdbuf *ctx_cs,
> > >                                            struct radv_pipeline *pipeline)
> > >   {
> > >       struct radv_multisample_state *ms = &pipeline->graphics.ms;
> > >
> > > -     radeon_set_context_reg_seq(cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
> > > -     radeon_emit(cs, ms->pa_sc_aa_mask[0]);
> > > -     radeon_emit(cs, ms->pa_sc_aa_mask[1]);
> > > +     radeon_set_context_reg_seq(ctx_cs, R_028C38_PA_SC_AA_MASK_X0Y0_X1Y0, 2);
> > > +     radeon_emit(ctx_cs, ms->pa_sc_aa_mask[0]);
> > > +     radeon_emit(ctx_cs, ms->pa_sc_aa_mask[1]);
> > >
> > > -     radeon_set_context_reg(cs, R_028804_DB_EQAA, ms->db_eqaa);
> > > -     radeon_set_context_reg(cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
> > > +     radeon_set_context_reg(ctx_cs, R_028804_DB_EQAA, ms->db_eqaa);
> > > +     radeon_set_context_reg(ctx_cs, R_028A4C_PA_SC_MODE_CNTL_1, ms->pa_sc_mode_cntl_1);
> > >
> > >       /* The exclusion bits can be set to improve rasterization efficiency
> > >        * if no sample lies on the pixel boundary (-8 sample offset). It's
> > >        * currently always TRUE because the driver doesn't support 16 samples.
> > >        */
> > >       bool exclusion = pipeline->device->physical_device->rad_info.chip_class >= CIK;
> > > -     radeon_set_context_reg(cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_02882C_PA_SU_PRIM_FILTER_CNTL,
> > >                              S_02882C_XMAX_RIGHT_EXCLUSION(exclusion) |
> > >                              S_02882C_YMAX_BOTTOM_EXCLUSION(exclusion));
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *cs,
> > > -                                   const struct radv_pipeline *pipeline)
> > > +radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *ctx_cs,
> > > +                                   struct radv_pipeline *pipeline)
> > >   {
> > >       const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
> > >
> > > @@ -2821,12 +2821,13 @@ radv_pipeline_generate_vgt_gs_mode(struct radeon_cmdbuf *cs,
> > >               vgt_primitiveid_en = true;
> > >       }
> > >
> > > -     radeon_set_context_reg(cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
> > > -     radeon_set_context_reg(cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
> > > +     radeon_set_context_reg(ctx_cs, R_028A84_VGT_PRIMITIVEID_EN, vgt_primitiveid_en);
> > > +     radeon_set_context_reg(ctx_cs, R_028A40_VGT_GS_MODE, vgt_gs_mode);
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *ctx_cs,
> > > +                          struct radeon_cmdbuf *cs,
> > >                            struct radv_pipeline *pipeline,
> > >                            struct radv_shader_variant *shader)
> > >   {
> > > @@ -2847,10 +2848,10 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *cs,
> > >               outinfo->writes_layer ||
> > >               outinfo->writes_viewport_index;
> > >
> > > -     radeon_set_context_reg(cs, R_0286C4_SPI_VS_OUT_CONFIG,
> > > +     radeon_set_context_reg(ctx_cs, R_0286C4_SPI_VS_OUT_CONFIG,
> > >                              S_0286C4_VS_EXPORT_COUNT(MAX2(1, outinfo->param_exports) - 1));
> > >
> > > -     radeon_set_context_reg(cs, R_02870C_SPI_SHADER_POS_FORMAT,
> > > +     radeon_set_context_reg(ctx_cs, R_02870C_SPI_SHADER_POS_FORMAT,
> > >                              S_02870C_POS0_EXPORT_FORMAT(V_02870C_SPI_SHADER_4COMP) |
> > >                              S_02870C_POS1_EXPORT_FORMAT(outinfo->pos_exports > 1 ?
> > >                                                          V_02870C_SPI_SHADER_4COMP :
> > > @@ -2862,13 +2863,13 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *cs,
> > >                                                          V_02870C_SPI_SHADER_4COMP :
> > >                                                          V_02870C_SPI_SHADER_NONE));
> > >
> > > -     radeon_set_context_reg(cs, R_028818_PA_CL_VTE_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_028818_PA_CL_VTE_CNTL,
> > >                              S_028818_VTX_W0_FMT(1) |
> > >                              S_028818_VPORT_X_SCALE_ENA(1) | S_028818_VPORT_X_OFFSET_ENA(1) |
> > >                              S_028818_VPORT_Y_SCALE_ENA(1) | S_028818_VPORT_Y_OFFSET_ENA(1) |
> > >                              S_028818_VPORT_Z_SCALE_ENA(1) | S_028818_VPORT_Z_OFFSET_ENA(1));
> > >
> > > -     radeon_set_context_reg(cs, R_02881C_PA_CL_VS_OUT_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_02881C_PA_CL_VS_OUT_CNTL,
> > >                              S_02881C_USE_VTX_POINT_SIZE(outinfo->writes_pointsize) |
> > >                              S_02881C_USE_VTX_RENDER_TARGET_INDX(outinfo->writes_layer) |
> > >                              S_02881C_USE_VTX_VIEWPORT_INDX(outinfo->writes_viewport_index) |
> > > @@ -2880,7 +2881,7 @@ radv_pipeline_generate_hw_vs(struct radeon_cmdbuf *cs,
> > >                              clip_dist_mask);
> > >
> > >       if (pipeline->device->physical_device->rad_info.chip_class <= VI)
> > > -             radeon_set_context_reg(cs, R_028AB4_VGT_REUSE_OFF,
> > > +             radeon_set_context_reg(ctx_cs, R_028AB4_VGT_REUSE_OFF,
> > >                                      outinfo->writes_viewport_index);
> > >   }
> > >
> > > @@ -2948,7 +2949,8 @@ radv_pipeline_generate_hw_hs(struct radeon_cmdbuf *cs,
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *ctx_cs,
> > > +                                  struct radeon_cmdbuf *cs,
> > >                                    struct radv_pipeline *pipeline,
> > >                                    const struct radv_tessellation_state *tess)
> > >   {
> > > @@ -2964,11 +2966,12 @@ radv_pipeline_generate_vertex_shader(struct radeon_cmdbuf *cs,
> > >       else if (vs->info.vs.as_es)
> > >               radv_pipeline_generate_hw_es(cs, pipeline, vs);
> > >       else
> > > -             radv_pipeline_generate_hw_vs(cs, pipeline, vs);
> > > +             radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, vs);
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *ctx_cs,
> > > +                                 struct radeon_cmdbuf *cs,
> > >                                   struct radv_pipeline *pipeline,
> > >                                   const struct radv_tessellation_state *tess)
> > >   {
> > > @@ -2984,24 +2987,25 @@ radv_pipeline_generate_tess_shaders(struct radeon_cmdbuf *cs,
> > >               if (tes->info.tes.as_es)
> > >                       radv_pipeline_generate_hw_es(cs, pipeline, tes);
> > >               else
> > > -                     radv_pipeline_generate_hw_vs(cs, pipeline, tes);
> > > +                     radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, tes);
> > >       }
> > >
> > >       radv_pipeline_generate_hw_hs(cs, pipeline, tcs, tess);
> > >
> > > -     radeon_set_context_reg(cs, R_028B6C_VGT_TF_PARAM,
> > > +     radeon_set_context_reg(ctx_cs, R_028B6C_VGT_TF_PARAM,
> > >                              tess->tf_param);
> > >
> > >       if (pipeline->device->physical_device->rad_info.chip_class >= CIK)
> > > -             radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
> > > +             radeon_set_context_reg_idx(ctx_cs, R_028B58_VGT_LS_HS_CONFIG, 2,
> > >                                          tess->ls_hs_config);
> > >       else
> > > -             radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
> > > +             radeon_set_context_reg(ctx_cs, R_028B58_VGT_LS_HS_CONFIG,
> > >                                      tess->ls_hs_config);
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *ctx_cs,
> > > +                                    struct radeon_cmdbuf *cs,
> > >                                      struct radv_pipeline *pipeline,
> > >                                      const struct radv_gs_state *gs_state)
> > >   {
> > > @@ -3022,32 +3026,32 @@ radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *cs,
> > >
> > >       offset = num_components[0] * gs_max_out_vertices;
> > >
> > > -     radeon_set_context_reg_seq(cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
> > > -     radeon_emit(cs, offset);
> > > +     radeon_set_context_reg_seq(ctx_cs, R_028A60_VGT_GSVS_RING_OFFSET_1, 3);
> > > +     radeon_emit(ctx_cs, offset);
> > >       if (max_stream >= 1)
> > >               offset += num_components[1] * gs_max_out_vertices;
> > > -     radeon_emit(cs, offset);
> > > +     radeon_emit(ctx_cs, offset);
> > >       if (max_stream >= 2)
> > >               offset += num_components[2] * gs_max_out_vertices;
> > > -     radeon_emit(cs, offset);
> > > +     radeon_emit(ctx_cs, offset);
> > >       if (max_stream >= 3)
> > >               offset += num_components[3] * gs_max_out_vertices;
> > > -     radeon_set_context_reg(cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
> > > +     radeon_set_context_reg(ctx_cs, R_028AB0_VGT_GSVS_RING_ITEMSIZE, offset);
> > >
> > > -     radeon_set_context_reg(cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
> > > +     radeon_set_context_reg(ctx_cs, R_028B38_VGT_GS_MAX_VERT_OUT, gs->info.gs.vertices_out);
> > >
> > > -     radeon_set_context_reg_seq(cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
> > > -     radeon_emit(cs, num_components[0]);
> > > -     radeon_emit(cs, (max_stream >= 1) ? num_components[1] : 0);
> > > -     radeon_emit(cs, (max_stream >= 2) ? num_components[2] : 0);
> > > -     radeon_emit(cs, (max_stream >= 3) ? num_components[3] : 0);
> > > +     radeon_set_context_reg_seq(ctx_cs, R_028B5C_VGT_GS_VERT_ITEMSIZE, 4);
> > > +     radeon_emit(ctx_cs, num_components[0]);
> > > +     radeon_emit(ctx_cs, (max_stream >= 1) ? num_components[1] : 0);
> > > +     radeon_emit(ctx_cs, (max_stream >= 2) ? num_components[2] : 0);
> > > +     radeon_emit(ctx_cs, (max_stream >= 3) ? num_components[3] : 0);
> > >
> > >       uint32_t gs_num_invocations = gs->info.gs.invocations;
> > > -     radeon_set_context_reg(cs, R_028B90_VGT_GS_INSTANCE_CNT,
> > > +     radeon_set_context_reg(ctx_cs, R_028B90_VGT_GS_INSTANCE_CNT,
> > >                              S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
> > >                              S_028B90_ENABLE(gs_num_invocations > 0));
> > >
> > > -     radeon_set_context_reg(cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
> > > +     radeon_set_context_reg(ctx_cs, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
> > >                              gs_state->vgt_esgs_ring_itemsize);
> > >
> > >       va = radv_buffer_get_va(gs->bo) + gs->bo_offset;
> > > @@ -3061,8 +3065,8 @@ radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *cs,
> > >               radeon_emit(cs, gs->rsrc1);
> > >               radeon_emit(cs, gs->rsrc2 | S_00B22C_LDS_SIZE(gs_state->lds_size));
> > >
> > > -             radeon_set_context_reg(cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
> > > -             radeon_set_context_reg(cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, gs_state->vgt_gs_max_prims_per_subgroup);
> > > +             radeon_set_context_reg(ctx_cs, R_028A44_VGT_GS_ONCHIP_CNTL, gs_state->vgt_gs_onchip_cntl);
> > > +             radeon_set_context_reg(ctx_cs, R_028A94_VGT_GS_MAX_PRIMS_PER_SUBGROUP, gs_state->vgt_gs_max_prims_per_subgroup);
> > >       } else {
> > >               radeon_set_sh_reg_seq(cs, R_00B220_SPI_SHADER_PGM_LO_GS, 4);
> > >               radeon_emit(cs, va >> 8);
> > > @@ -3071,7 +3075,7 @@ radv_pipeline_generate_geometry_shader(struct radeon_cmdbuf *cs,
> > >               radeon_emit(cs, gs->rsrc2);
> > >       }
> > >
> > > -     radv_pipeline_generate_hw_vs(cs, pipeline, pipeline->gs_copy_shader);
> > > +     radv_pipeline_generate_hw_vs(ctx_cs, cs, pipeline, pipeline->gs_copy_shader);
> > >   }
> > >
> > >   static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade)
> > > @@ -3093,8 +3097,8 @@ static uint32_t offset_to_ps_input(uint32_t offset, bool flat_shade)
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *cs,
> > > -                                 struct radv_pipeline *pipeline)
> > > +radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *ctx_cs,
> > > +                              struct radv_pipeline *pipeline)
> > >   {
> > >       struct radv_shader_variant *ps = pipeline->shaders[MESA_SHADER_FRAGMENT];
> > >       const struct radv_vs_output_info *outinfo = get_vs_output_info(pipeline);
> > > @@ -3165,9 +3169,9 @@ radv_pipeline_generate_ps_inputs(struct radeon_cmdbuf *cs,
> > >       }
> > >
> > >       if (ps_offset) {
> > > -             radeon_set_context_reg_seq(cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
> > > +             radeon_set_context_reg_seq(ctx_cs, R_028644_SPI_PS_INPUT_CNTL_0, ps_offset);
> > >               for (unsigned i = 0; i < ps_offset; i++) {
> > > -                     radeon_emit(cs, ps_input_cntl[i]);
> > > +                     radeon_emit(ctx_cs, ps_input_cntl[i]);
> > >               }
> > >       }
> > >   }
> > > @@ -3205,7 +3209,8 @@ radv_compute_db_shader_control(const struct radv_device *device,
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *ctx_cs,
> > > +                                    struct radeon_cmdbuf *cs,
> > >                                      struct radv_pipeline *pipeline)
> > >   {
> > >       struct radv_shader_variant *ps;
> > > @@ -3221,22 +3226,22 @@ radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *cs,
> > >       radeon_emit(cs, ps->rsrc1);
> > >       radeon_emit(cs, ps->rsrc2);
> > >
> > > -     radeon_set_context_reg(cs, R_02880C_DB_SHADER_CONTROL,
> > > +     radeon_set_context_reg(ctx_cs, R_02880C_DB_SHADER_CONTROL,
> > >                              radv_compute_db_shader_control(pipeline->device,
> > >                                                             pipeline, ps));
> > >
> > > -     radeon_set_context_reg(cs, R_0286CC_SPI_PS_INPUT_ENA,
> > > +     radeon_set_context_reg(ctx_cs, R_0286CC_SPI_PS_INPUT_ENA,
> > >                              ps->config.spi_ps_input_ena);
> > >
> > > -     radeon_set_context_reg(cs, R_0286D0_SPI_PS_INPUT_ADDR,
> > > +     radeon_set_context_reg(ctx_cs, R_0286D0_SPI_PS_INPUT_ADDR,
> > >                              ps->config.spi_ps_input_addr);
> > >
> > > -     radeon_set_context_reg(cs, R_0286D8_SPI_PS_IN_CONTROL,
> > > +     radeon_set_context_reg(ctx_cs, R_0286D8_SPI_PS_IN_CONTROL,
> > >                              S_0286D8_NUM_INTERP(ps->info.fs.num_interp));
> > >
> > > -     radeon_set_context_reg(cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
> > > +     radeon_set_context_reg(ctx_cs, R_0286E0_SPI_BARYC_CNTL, pipeline->graphics.spi_baryc_cntl);
> > >
> > > -     radeon_set_context_reg(cs, R_028710_SPI_SHADER_Z_FORMAT,
> > > +     radeon_set_context_reg(ctx_cs, R_028710_SPI_SHADER_Z_FORMAT,
> > >                              ac_get_spi_shader_z_format(ps->info.info.ps.writes_z,
> > >                                                         ps->info.info.ps.writes_stencil,
> > >                                                         ps->info.info.ps.writes_sample_mask));
> > > @@ -3249,7 +3254,7 @@ radv_pipeline_generate_fragment_shader(struct radeon_cmdbuf *cs,
> > >   }
> > >
> > >   static void
> > > -radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *cs,
> > > +radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *ctx_cs,
> > >                                       struct radv_pipeline *pipeline)
> > >   {
> > >       if (pipeline->device->physical_device->rad_info.family < CHIP_POLARIS10)
> > > @@ -3260,7 +3265,7 @@ radv_pipeline_generate_vgt_vertex_reuse(struct radeon_cmdbuf *cs,
> > >           radv_get_shader(pipeline, MESA_SHADER_TESS_EVAL)->info.tes.spacing == TESS_SPACING_FRACTIONAL_ODD) {
> > >               vtx_reuse_depth = 14;
> > >       }
> > > -     radeon_set_context_reg(cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
> > > +     radeon_set_context_reg(ctx_cs, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
> > >                              S_028C58_VTX_REUSE_DEPTH(vtx_reuse_depth));
> > >   }
> > >
> > > @@ -3330,38 +3335,46 @@ radv_pipeline_generate_pm4(struct radv_pipeline *pipeline,
> > >                              const struct radv_gs_state *gs,
> > >                              unsigned prim, unsigned gs_out)
> > >   {
> > > -     pipeline->cs.buf = malloc(4 * 256);
> > > -     pipeline->cs.max_dw = 256;
> > > -
> > > -     radv_pipeline_generate_depth_stencil_state(&pipeline->cs, pipeline, pCreateInfo, extra);
> > > -     radv_pipeline_generate_blend_state(&pipeline->cs, pipeline, blend);
> > > -     radv_pipeline_generate_raster_state(&pipeline->cs, pipeline, pCreateInfo);
> > > -     radv_pipeline_generate_multisample_state(&pipeline->cs, pipeline);
> > > -     radv_pipeline_generate_vgt_gs_mode(&pipeline->cs, pipeline);
> > > -     radv_pipeline_generate_vertex_shader(&pipeline->cs, pipeline, tess);
> > > -     radv_pipeline_generate_tess_shaders(&pipeline->cs, pipeline, tess);
> > > -     radv_pipeline_generate_geometry_shader(&pipeline->cs, pipeline, gs);
> > > -     radv_pipeline_generate_fragment_shader(&pipeline->cs, pipeline);
> > > -     radv_pipeline_generate_ps_inputs(&pipeline->cs, pipeline);
> > > -     radv_pipeline_generate_vgt_vertex_reuse(&pipeline->cs, pipeline);
> > > -     radv_pipeline_generate_binning_state(&pipeline->cs, pipeline, pCreateInfo);
> > > -
> > > -     radeon_set_context_reg(&pipeline->cs, R_0286E8_SPI_TMPRING_SIZE,
> > > +     struct radeon_cmdbuf *ctx_cs = &pipeline->ctx_cs;
> > > +     struct radeon_cmdbuf *cs = &pipeline->cs;
> > > +
> > > +     cs->max_dw = 64;
> > > +     ctx_cs->max_dw = 256;
> > > +     cs->buf = malloc(4 * (cs->max_dw + ctx_cs->max_dw));
> > > +     ctx_cs->buf = cs->buf + cs->max_dw;
> > > +
> > > +     radv_pipeline_generate_depth_stencil_state(ctx_cs, pipeline, pCreateInfo, extra);
> > > +     radv_pipeline_generate_blend_state(ctx_cs, pipeline, blend);
> > > +     radv_pipeline_generate_raster_state(ctx_cs, pipeline, pCreateInfo);
> > > +     radv_pipeline_generate_multisample_state(ctx_cs, pipeline);
> > > +     radv_pipeline_generate_vgt_gs_mode(ctx_cs, pipeline);
> > > +     radv_pipeline_generate_vertex_shader(ctx_cs, cs, pipeline, tess);
> > > +     radv_pipeline_generate_tess_shaders(ctx_cs, cs, pipeline, tess);
> > > +     radv_pipeline_generate_geometry_shader(ctx_cs, cs, pipeline, gs);
> > > +     radv_pipeline_generate_fragment_shader(ctx_cs, cs, pipeline);
> > > +     radv_pipeline_generate_ps_inputs(ctx_cs, pipeline);
> > > +     radv_pipeline_generate_vgt_vertex_reuse(ctx_cs, pipeline);
> > > +     radv_pipeline_generate_binning_state(ctx_cs, pipeline, pCreateInfo);
> > > +
> > > +     radeon_set_context_reg(ctx_cs, R_0286E8_SPI_TMPRING_SIZE,
> > >                              S_0286E8_WAVES(pipeline->max_waves) |
> > >                              S_0286E8_WAVESIZE(pipeline->scratch_bytes_per_wave >> 10));
> > >
> > > -     radeon_set_context_reg(&pipeline->cs, R_028B54_VGT_SHADER_STAGES_EN, radv_compute_vgt_shader_stages_en(pipeline));
> > > +     radeon_set_context_reg(ctx_cs, R_028B54_VGT_SHADER_STAGES_EN, radv_compute_vgt_shader_stages_en(pipeline));
> > >
> > >       if (pipeline->device->physical_device->rad_info.chip_class >= CIK) {
> > > -             radeon_set_uconfig_reg_idx(&pipeline->cs, R_030908_VGT_PRIMITIVE_TYPE, 1, prim);
> > > +             radeon_set_uconfig_reg_idx(cs, R_030908_VGT_PRIMITIVE_TYPE, 1, prim);
> > >       } else {
> > > -             radeon_set_config_reg(&pipeline->cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
> > > +             radeon_set_config_reg(cs, R_008958_VGT_PRIMITIVE_TYPE, prim);
> > >       }
> > > -     radeon_set_context_reg(&pipeline->cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
> > > +     radeon_set_context_reg(ctx_cs, R_028A6C_VGT_GS_OUT_PRIM_TYPE, gs_out);
> > >
> > > -     radeon_set_context_reg(&pipeline->cs, R_02820C_PA_SC_CLIPRECT_RULE, radv_compute_cliprect_rule(pCreateInfo));
> > > +     radeon_set_context_reg(ctx_cs, R_02820C_PA_SC_CLIPRECT_RULE, radv_compute_cliprect_rule(pCreateInfo));
> > >
> > > -     assert(pipeline->cs.cdw <= pipeline->cs.max_dw);
> > > +     pipeline->ctx_cs_hash = _mesa_hash_data(ctx_cs->buf, ctx_cs->cdw * 4);
> > > +
> > > +     assert(ctx_cs->cdw <= ctx_cs->max_dw);
> > > +     assert(cs->cdw <= cs->max_dw);
> > >   }
> > >
> > >   static struct radv_ia_multi_vgt_param_helpers
> > > diff --git a/src/amd/vulkan/radv_private.h b/src/amd/vulkan/radv_private.h
> > > index 6089ee6a607..f6534ae3309 100644
> > > --- a/src/amd/vulkan/radv_private.h
> > > +++ b/src/amd/vulkan/radv_private.h
> > > @@ -1365,6 +1365,8 @@ struct radv_pipeline {
> > >       VkShaderStageFlags                           active_stages;
> > >
> > >       struct radeon_cmdbuf                      cs;
> > > +     uint32_t                                  ctx_cs_hash;
> > > +     struct radeon_cmdbuf                      ctx_cs;
> > >
> > >       struct radv_vertex_elements_info             vertex_elements;
> > >


More information about the mesa-dev mailing list