<div dir="auto">Thanks. It looks good.<div dir="auto"><br></div><div dir="auto">Marek</div></div><br><div class="gmail_quote"><div dir="ltr" class="gmail_attr">On Thu, Apr 25, 2019, 5:17 PM Dylan Baker <<a href="mailto:dylan@pnwbakers.com">dylan@pnwbakers.com</a>> wrote:<br></div><blockquote class="gmail_quote" style="margin:0 0 0 .8ex;border-left:1px #ccc solid;padding-left:1ex">Hi Marek,<br>
<br>
I've tried to apply this to 19.0, I had to pull "radeonsi: add si_debug_options<br>
for convenient adding/removing of options", which is fine, but this patch also<br>
assumes your si compute-queue-only patches, which aren't present in 19.0. I've<br>
made a small change to get it compiling, but I'm sure it's not the right fix, so<br>
if you could take a look at the staging/19.0 branch and let me know what you'd<br>
like to do I'd appreciate it.<br>
<br>
Thanks,<br>
Dylan<br>
<br>
Quoting Marek Olšák (2019-04-18 14:46:27)<br>
> From: Marek Olšák <<a href="mailto:marek.olsak@amd.com" target="_blank" rel="noreferrer">marek.olsak@amd.com</a>><br>
> <br>
> Needed to track context rolls caused by streamout and ACQUIRE_MEM.<br>
> ACQUIRE_MEM can occur outside of draw calls.<br>
> <br>
> Bugzilla: <a href="https://bugs.freedesktop.org/show_bug.cgi?id=110355" rel="noreferrer noreferrer" target="_blank">https://bugs.freedesktop.org/show_bug.cgi?id=110355</a><br>
> <br>
> v2: squashed patches and done more rework<br>
> <br>
> Cc: 19.0 <<a href="mailto:mesa-stable@lists.freedesktop.org" target="_blank" rel="noreferrer">mesa-stable@lists.freedesktop.org</a>><br>
> ---<br>
>  src/gallium/drivers/radeonsi/si_pipe.c        |  2 +<br>
>  src/gallium/drivers/radeonsi/si_pipe.h        |  3 +-<br>
>  src/gallium/drivers/radeonsi/si_state.c       |  8 +-<br>
>  .../drivers/radeonsi/si_state_binning.c       |  4 +-<br>
>  src/gallium/drivers/radeonsi/si_state_draw.c  | 86 +++++++++++--------<br>
>  .../drivers/radeonsi/si_state_shaders.c       | 10 +--<br>
>  .../drivers/radeonsi/si_state_streamout.c     |  1 +<br>
>  .../drivers/radeonsi/si_state_viewport.c      |  2 +-<br>
>  8 files changed, 68 insertions(+), 48 deletions(-)<br>
> <br>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c<br>
> index fa96ce34224..7209db9fb37 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_pipe.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c<br>
> @@ -1072,20 +1072,22 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,<br>
>  <br>
>         sscreen->has_out_of_order_rast = sscreen->info.chip_class >= VI &&<br>
>                                          sscreen->info.max_se >= 2 &&<br>
>                                          !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));<br>
>         sscreen->assume_no_z_fights =<br>
>                 driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");<br>
>         sscreen->commutative_blend_add =<br>
>                 driQueryOptionb(config->options, "radeonsi_commutative_blend_add");<br>
>         sscreen->clear_db_cache_before_clear =<br>
>                 driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear");<br>
> +       sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 ||<br>
> +                                       sscreen->info.family == CHIP_RAVEN;<br>
>         sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 &&<br>
>                                             sscreen->info.family <= CHIP_POLARIS12) ||<br>
>                                            sscreen->info.family == CHIP_VEGA10 ||<br>
>                                            sscreen->info.family == CHIP_RAVEN;<br>
>         sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 ||<br>
>                                         sscreen->info.family == CHIP_RAVEN;<br>
>         sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2;<br>
>  <br>
>         /* Only enable primitive binning on APUs by default. */<br>
>         sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN ||<br>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> index aaa95f32d20..a4c90a4f69f 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_pipe.h<br>
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h<br>
> @@ -463,20 +463,21 @@ struct si_screen {<br>
>         unsigned                        eqaa_force_coverage_samples;<br>
>         unsigned                        eqaa_force_z_samples;<br>
>         unsigned                        eqaa_force_color_samples;<br>
>         bool                            has_clear_state;<br>
>         bool                            has_distributed_tess;<br>
>         bool                            has_draw_indirect_multi;<br>
>         bool                            has_out_of_order_rast;<br>
>         bool                            assume_no_z_fights;<br>
>         bool                            commutative_blend_add;<br>
>         bool                            clear_db_cache_before_clear;<br>
> +       bool                            has_gfx9_scissor_bug;<br>
>         bool                            has_msaa_sample_loc_bug;<br>
>         bool                            has_ls_vgpr_init_bug;<br>
>         bool                            has_dcc_constant_encode;<br>
>         bool                            dpbb_allowed;<br>
>         bool                            dfsm_allowed;<br>
>         bool                            llvm_has_working_vgpr_indexing;<br>
>  <br>
>         /* Whether shaders are monolithic (1-part) or separate (3-part). */<br>
>         bool                            use_monolithic_shaders;<br>
>         bool                            record_llvm_ir;<br>
> @@ -1062,21 +1063,21 @@ struct si_context {<br>
>         unsigned                        num_vs_flushes;<br>
>         unsigned                        num_ps_flushes;<br>
>         unsigned                        num_cs_flushes;<br>
>         unsigned                        num_cb_cache_flushes;<br>
>         unsigned                        num_db_cache_flushes;<br>
>         unsigned                        num_L2_invalidates;<br>
>         unsigned                        num_L2_writebacks;<br>
>         unsigned                        num_resident_handles;<br>
>         uint64_t                        num_alloc_tex_transfer_bytes;<br>
>         unsigned                        last_tex_ps_draw_ratio; /* for query */<br>
> -       unsigned                        context_roll_counter;<br>
> +       unsigned                        context_roll;<br>
>  <br>
>         /* Queries. */<br>
>         /* Maintain the list of active queries for pausing between IBs. */<br>
>         int                             num_occlusion_queries;<br>
>         int                             num_perfect_occlusion_queries;<br>
>         struct list_head                active_queries;<br>
>         unsigned                        num_cs_dw_queries_suspend;<br>
>  <br>
>         /* Render condition. */<br>
>         struct pipe_query               *render_cond;<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c<br>
> index 757c17f7df8..bc7e777ad73 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state.c<br>
> @@ -249,21 +249,21 @@ static void si_emit_cb_render_state(struct si_context *sctx)<br>
>                         }<br>
>                 }<br>
>  <br>
>                 /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */<br>
>                 radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,<br>
>                                             SI_TRACKED_SX_PS_DOWNCONVERT,<br>
>                                             sx_ps_downconvert, sx_blend_opt_epsilon,<br>
>                                             sx_blend_opt_control);<br>
>         }<br>
>         if (initial_cdw != cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  /*<br>
>   * Blender functions<br>
>   */<br>
>  <br>
>  static uint32_t si_translate_blend_function(int blend_func)<br>
>  {<br>
>         switch (blend_func) {<br>
>         case PIPE_BLEND_ADD:<br>
> @@ -786,21 +786,21 @@ static void si_emit_clip_regs(struct si_context *sctx)<br>
>                 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |<br>
>                 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |<br>
>                 clipdist_mask | (culldist_mask << 8));<br>
>         radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,<br>
>                 SI_TRACKED_PA_CL_CLIP_CNTL,<br>
>                 rs->pa_cl_clip_cntl |<br>
>                 ucp_mask |<br>
>                 S_028810_CLIP_DISABLE(window_space));<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  /*<br>
>   * inferred state between framebuffer and rasterizer<br>
>   */<br>
>  static void si_update_poly_offset_state(struct si_context *sctx)<br>
>  {<br>
>         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;<br>
>  <br>
>         if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {<br>
> @@ -1448,21 +1448,21 @@ static void si_emit_db_render_state(struct si_context *sctx)<br>
>                 db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;<br>
>  <br>
>         if (sctx->screen->has_rbplus &&<br>
>             !sctx->screen->rbplus_allowed)<br>
>                 db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);<br>
>  <br>
>         radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,<br>
>                                    SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  /*<br>
>   * format translation<br>
>   */<br>
>  static uint32_t si_translate_colorformat(enum pipe_format format)<br>
>  {<br>
>         const struct util_format_description *desc = util_format_description(format);<br>
>         if (!desc)<br>
>                 return V_028C70_COLOR_INVALID;<br>
> @@ -3537,21 +3537,21 @@ static void si_emit_msaa_config(struct si_context *sctx)<br>
>                                     SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,<br>
>                                     sc_aa_config);<br>
>         /* R_028804_DB_EQAA */<br>
>         radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,<br>
>                                    db_eqaa);<br>
>         /* R_028A4C_PA_SC_MODE_CNTL_1 */<br>
>         radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,<br>
>                                    SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);<br>
>  <br>
>         if (initial_cdw != cs->current.cdw) {<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  <br>
>                 /* GFX9: Flush DFSM when the AA mode changes. */<br>
>                 if (sctx->screen->dfsm_allowed) {<br>
>                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));<br>
>                         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));<br>
>                 }<br>
>         }<br>
>  }<br>
>  <br>
>  void si_update_ps_iter_samples(struct si_context *sctx)<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c<br>
> index 3516e561282..5c6c2e69b90 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state_binning.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state_binning.c<br>
> @@ -314,21 +314,21 @@ static void si_emit_dpbb_disable(struct si_context *sctx)<br>
>  <br>
>         radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,<br>
>                 SI_TRACKED_PA_SC_BINNER_CNTL_0,<br>
>                 S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |<br>
>                 S_028C44_DISABLE_START_OF_PRIM(1));<br>
>         radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,<br>
>                                    SI_TRACKED_DB_DFSM_CONTROL,<br>
>                                    S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |<br>
>                                    S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));<br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  void si_emit_dpbb_state(struct si_context *sctx)<br>
>  {<br>
>         struct si_screen *sscreen = sctx->screen;<br>
>         struct si_state_blend *blend = sctx->queued.named.blend;<br>
>         struct si_state_dsa *dsa = sctx->queued.named.dsa;<br>
>         unsigned db_shader_control = sctx->ps_db_shader_control;<br>
>  <br>
>         assert(sctx->chip_class >= GFX9);<br>
> @@ -436,12 +436,12 @@ void si_emit_dpbb_state(struct si_context *sctx)<br>
>                 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |<br>
>                 S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |<br>
>                 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |<br>
>                 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |<br>
>                 S_028C44_OPTIMAL_BIN_SELECTION(1));<br>
>         radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,<br>
>                                    SI_TRACKED_DB_DFSM_CONTROL,<br>
>                                    S_028060_PUNCHOUT_MODE(punchout_mode) |<br>
>                                    S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));<br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c<br>
> index 2a514f144b9..8798f9ad0a0 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c<br>
> @@ -59,21 +59,21 @@ static unsigned si_conv_pipe_prim(unsigned mode)<br>
>         return prim_conv[mode];<br>
>  }<br>
>  <br>
>  /**<br>
>   * This calculates the LDS size for tessellation shaders (VS, TCS, TES).<br>
>   * LS.LDS_SIZE is shared by all 3 shader stages.<br>
>   *<br>
>   * The information about LDS and other non-compile-time parameters is then<br>
>   * written to userdata SGPRs.<br>
>   */<br>
> -static bool si_emit_derived_tess_state(struct si_context *sctx,<br>
> +static void si_emit_derived_tess_state(struct si_context *sctx,<br>
>                                        const struct pipe_draw_info *info,<br>
>                                        unsigned *num_patches)<br>
>  {<br>
>         struct radeon_cmdbuf *cs = sctx->gfx_cs;<br>
>         struct si_shader *ls_current;<br>
>         struct si_shader_selector *ls;<br>
>         /* The TES pointer will only be used for sctx->last_tcs.<br>
>          * It would be wrong to think that TCS = TES. */<br>
>         struct si_shader_selector *tcs =<br>
>                 sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;<br>
> @@ -103,21 +103,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,<br>
>                 ls = sctx->vs_shader.cso;<br>
>         }<br>
>  <br>
>         if (sctx->last_ls == ls_current &&<br>
>             sctx->last_tcs == tcs &&<br>
>             sctx->last_tes_sh_base == tes_sh_base &&<br>
>             sctx->last_num_tcs_input_cp == num_tcs_input_cp &&<br>
>             (!has_primid_instancing_bug ||<br>
>              (sctx->last_tess_uses_primid == tess_uses_primid))) {<br>
>                 *num_patches = sctx->last_num_patches;<br>
> -               return false;<br>
> +               return;<br>
>         }<br>
>  <br>
>         sctx->last_ls = ls_current;<br>
>         sctx->last_tcs = tcs;<br>
>         sctx->last_tes_sh_base = tes_sh_base;<br>
>         sctx->last_num_tcs_input_cp = num_tcs_input_cp;<br>
>         sctx->last_tess_uses_primid = tess_uses_primid;<br>
>  <br>
>         /* This calculates how shader inputs and outputs among VS, TCS, and TES<br>
>          * are laid out in LDS. */<br>
> @@ -298,23 +298,22 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,<br>
>  <br>
>         if (sctx->last_ls_hs_config != ls_hs_config) {<br>
>                 if (sctx->chip_class >= CIK) {<br>
>                         radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,<br>
>                                                    ls_hs_config);<br>
>                 } else {<br>
>                         radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,<br>
>                                                ls_hs_config);<br>
>                 }<br>
>                 sctx->last_ls_hs_config = ls_hs_config;<br>
> -               return true; /* true if the context rolls */<br>
> +               sctx->context_roll = true;<br>
>         }<br>
> -       return false;<br>
>  }<br>
>  <br>
>  static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)<br>
>  {<br>
>         switch (info->mode) {<br>
>         case PIPE_PRIM_PATCHES:<br>
>                 return info->count / info->vertices_per_patch;<br>
>         case PIPE_PRIM_POLYGON:<br>
>                 return info->count >= 3;<br>
>         case SI_PRIM_RECTANGLE_LIST:<br>
> @@ -534,44 +533,44 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,<br>
>                      (info->instance_count > 1 &&<br>
>                       (info->count_from_stream_output ||<br>
>                        si_num_prims_for_vertices(info) <= 1))))<br>
>                         sctx->flags |= SI_CONTEXT_VGT_FLUSH;<br>
>         }<br>
>  <br>
>         return ia_multi_vgt_param;<br>
>  }<br>
>  <br>
>  /* rast_prim is the primitive type after GS. */<br>
> -static bool si_emit_rasterizer_prim_state(struct si_context *sctx)<br>
> +static void si_emit_rasterizer_prim_state(struct si_context *sctx)<br>
>  {<br>
>         struct radeon_cmdbuf *cs = sctx->gfx_cs;<br>
>         enum pipe_prim_type rast_prim = sctx->current_rast_prim;<br>
>         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;<br>
>  <br>
>         /* Skip this if not rendering lines. */<br>
>         if (!util_prim_is_lines(rast_prim))<br>
> -               return false;<br>
> +               return;<br>
>  <br>
>         if (rast_prim == sctx->last_rast_prim &&<br>
>             rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)<br>
> -               return false;<br>
> +               return;<br>
>  <br>
>         /* For lines, reset the stipple pattern at each primitive. Otherwise,<br>
>          * reset the stipple pattern at each packet (line strips, line loops).<br>
>          */<br>
>         radeon_set_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE,<br>
>                 rs->pa_sc_line_stipple |<br>
>                 S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2));<br>
>  <br>
>         sctx->last_rast_prim = rast_prim;<br>
>         sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;<br>
> -       return true; /* true if the context rolls */<br>
> +       sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  static void si_emit_vs_state(struct si_context *sctx,<br>
>                              const struct pipe_draw_info *info)<br>
>  {<br>
>         sctx->current_vs_state &= C_VS_STATE_INDEXED;<br>
>         sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);<br>
>  <br>
>         if (sctx->num_vs_blit_sgprs) {<br>
>                 /* Re-emit the state after we leave u_blitter. */<br>
> @@ -652,20 +651,21 @@ static void si_emit_draw_registers(struct si_context *sctx,<br>
>                         radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,<br>
>                                                info->primitive_restart);<br>
>  <br>
>                 sctx->last_primitive_restart_en = info->primitive_restart;<br>
>  <br>
>         }<br>
>         if (si_prim_restart_index_changed(sctx, info)) {<br>
>                 radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,<br>
>                                        info->restart_index);<br>
>                 sctx->last_restart_index = info->restart_index;<br>
> +               sctx->context_roll = true;<br>
>         }<br>
>  }<br>
>  <br>
>  static void si_emit_draw_packets(struct si_context *sctx,<br>
>                                  const struct pipe_draw_info *info,<br>
>                                  struct pipe_resource *indexbuf,<br>
>                                  unsigned index_size,<br>
>                                  unsigned index_offset)<br>
>  {<br>
>         struct pipe_draw_indirect_info *indirect = info->indirect;<br>
> @@ -889,20 +889,25 @@ static void si_emit_surface_sync(struct si_context *sctx,<br>
>                 radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */<br>
>                 radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */<br>
>         } else {<br>
>                 /* ACQUIRE_MEM is only required on a compute ring. */<br>
>                 radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));<br>
>                 radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */<br>
>                 radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */<br>
>                 radeon_emit(cs, 0);               /* CP_COHER_BASE */<br>
>                 radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */<br>
>         }<br>
> +<br>
> +       /* ACQUIRE_MEM has an implicit context roll if the current context<br>
> +        * is busy. */<br>
> +       if (sctx->has_graphics)<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  void si_emit_cache_flush(struct si_context *sctx)<br>
>  {<br>
>         struct radeon_cmdbuf *cs = sctx->gfx_cs;<br>
>         uint32_t flags = sctx->flags;<br>
>  <br>
>         if (!sctx->has_graphics) {<br>
>                 /* Only process compute flags. */<br>
>                 flags &= SI_CONTEXT_INV_ICACHE |<br>
> @@ -1216,40 +1221,24 @@ static void si_get_draw_start_count(struct si_context *sctx,<br>
>         } else {<br>
>                 *start = info->start;<br>
>                 *count = info->count;<br>
>         }<br>
>  }<br>
>  <br>
>  static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,<br>
>                                unsigned skip_atom_mask)<br>
>  {<br>
>         unsigned num_patches = 0;<br>
> -       /* Vega10/Raven scissor bug workaround. When any context register is<br>
> -        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR<br>
> -        * registers must be written too.<br>
> -        */<br>
> -       bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&<br>
> -                                 !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);<br>
> -       bool context_roll = false; /* set correctly for GFX9 only */<br>
>  <br>
> -       context_roll |= si_emit_rasterizer_prim_state(sctx);<br>
> +       si_emit_rasterizer_prim_state(sctx);<br>
>         if (sctx->tes_shader.cso)<br>
> -               context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);<br>
> -<br>
> -       if (handle_scissor_bug &&<br>
> -           (info->count_from_stream_output ||<br>
> -            sctx->dirty_atoms & si_atoms_that_always_roll_context() ||<br>
> -            sctx->dirty_states & si_states_that_always_roll_context() ||<br>
> -            si_prim_restart_index_changed(sctx, info)))<br>
> -               context_roll = true;<br>
> -<br>
> -       sctx->context_roll_counter = 0;<br>
> +               si_emit_derived_tess_state(sctx, info, &num_patches);<br>
>  <br>
>         /* Emit state atoms. */<br>
>         unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;<br>
>         while (mask)<br>
>                 sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);<br>
>  <br>
>         sctx->dirty_atoms &= skip_atom_mask;<br>
>  <br>
>         /* Emit states. */<br>
>         mask = sctx->dirty_states;<br>
> @@ -1258,26 +1247,20 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i<br>
>                 struct si_pm4_state *state = sctx->queued.array[i];<br>
>  <br>
>                 if (!state || sctx->emitted.array[i] == state)<br>
>                         continue;<br>
>  <br>
>                 si_pm4_emit(sctx, state);<br>
>                 sctx->emitted.array[i] = state;<br>
>         }<br>
>         sctx->dirty_states = 0;<br>
>  <br>
> -       if (handle_scissor_bug &&<br>
> -           (context_roll || sctx->context_roll_counter)) {<br>
> -               sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;<br>
> -               sctx->atoms.s.scissors.emit(sctx);<br>
> -       }<br>
> -<br>
>         /* Emit draw states. */<br>
>         si_emit_vs_state(sctx, info);<br>
>         si_emit_draw_registers(sctx, info, num_patches);<br>
>  }<br>
>  <br>
>  static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)<br>
>  {<br>
>         struct si_context *sctx = (struct si_context *)ctx;<br>
>         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;<br>
>         struct pipe_resource *indexbuf = info->index.resource;<br>
> @@ -1462,45 +1445,66 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i<br>
>  <br>
>         si_need_gfx_cs_space(sctx);<br>
>  <br>
>         /* Since we've called si_context_add_resource_size for vertex buffers,<br>
>          * this must be called after si_need_cs_space, because we must let<br>
>          * need_cs_space flush before we add buffers to the buffer list.<br>
>          */<br>
>         if (!si_upload_vertex_buffer_descriptors(sctx))<br>
>                 goto return_cleanup;<br>
>  <br>
> +       /* Vega10/Raven scissor bug workaround. When any context register is<br>
> +        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR<br>
> +        * registers must be written too.<br>
> +        */<br>
> +       bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug;<br>
> +       unsigned masked_atoms = 0;<br>
> +<br>
> +       if (has_gfx9_scissor_bug) {<br>
> +               masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);<br>
> +<br>
> +               if (info->count_from_stream_output ||<br>
> +                   sctx->dirty_atoms & si_atoms_that_always_roll_context() ||<br>
> +                   sctx->dirty_states & si_states_that_always_roll_context())<br>
> +                       sctx->context_roll = true;<br>
> +       }<br>
> +<br>
>         /* Use optimal packet order based on whether we need to sync the pipeline. */<br>
>         if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |<br>
>                                       SI_CONTEXT_FLUSH_AND_INV_DB |<br>
>                                       SI_CONTEXT_PS_PARTIAL_FLUSH |<br>
>                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {<br>
>                 /* If we have to wait for idle, set all states first, so that all<br>
>                  * SET packets are processed in parallel with previous draw calls.<br>
>                  * Then draw and prefetch at the end. This ensures that the time<br>
>                  * the CUs are idle is very short.<br>
>                  */<br>
> -               unsigned masked_atoms = 0;<br>
> -<br>
>                 if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))<br>
>                         masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);<br>
>  <br>
>                 if (!si_upload_graphics_shader_descriptors(sctx))<br>
>                         goto return_cleanup;<br>
>  <br>
>                 /* Emit all states except possibly render condition. */<br>
>                 si_emit_all_states(sctx, info, masked_atoms);<br>
>                 si_emit_cache_flush(sctx);<br>
>                 /* <-- CUs are idle here. */<br>
>  <br>
>                 if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))<br>
>                         sctx->atoms.s.render_cond.emit(sctx);<br>
> +<br>
> +               if (has_gfx9_scissor_bug &&<br>
> +                   (sctx->context_roll ||<br>
> +                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {<br>
> +                       sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;<br>
> +                       sctx->atoms.s.scissors.emit(sctx);<br>
> +               }<br>
>                 sctx->dirty_atoms = 0;<br>
>  <br>
>                 si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);<br>
>                 /* <-- CUs are busy here. */<br>
>  <br>
>                 /* Start prefetches after the draw has been started. Both will run<br>
>                  * in parallel, but starting the draw first is more important.<br>
>                  */<br>
>                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)<br>
>                         cik_emit_prefetch_L2(sctx, false);<br>
> @@ -1511,29 +1515,41 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i<br>
>                 if (sctx->flags)<br>
>                         si_emit_cache_flush(sctx);<br>
>  <br>
>                 /* Only prefetch the API VS and VBO descriptors. */<br>
>                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)<br>
>                         cik_emit_prefetch_L2(sctx, true);<br>
>  <br>
>                 if (!si_upload_graphics_shader_descriptors(sctx))<br>
>                         return;<br>
>  <br>
> -               si_emit_all_states(sctx, info, 0);<br>
> +               si_emit_all_states(sctx, info, masked_atoms);<br>
> +<br>
> +               if (has_gfx9_scissor_bug &&<br>
> +                   (sctx->context_roll ||<br>
> +                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {<br>
> +                       sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;<br>
> +                       sctx->atoms.s.scissors.emit(sctx);<br>
> +               }<br>
> +               sctx->dirty_atoms = 0;<br>
> +<br>
>                 si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);<br>
>  <br>
>                 /* Prefetch the remaining shaders after the draw has been<br>
>                  * started. */<br>
>                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)<br>
>                         cik_emit_prefetch_L2(sctx, false);<br>
>         }<br>
>  <br>
> +       /* Clear the context roll flag after the draw call. */<br>
> +       sctx->context_roll = false;<br>
> +<br>
>         if (unlikely(sctx->current_saved_cs)) {<br>
>                 si_trace_emit(sctx);<br>
>                 si_log_draw_state(sctx, sctx->log);<br>
>         }<br>
>  <br>
>         /* Workaround for a VGT hang when streamout is enabled.<br>
>          * It must be done after drawing. */<br>
>         if ((sctx->family == CHIP_HAWAII ||<br>
>              sctx->family == CHIP_TONGA ||<br>
>              sctx->family == CHIP_FIJI) &&<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c<br>
> index 5bdfd4f6ac1..d00bb170981 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c<br>
> @@ -569,21 +569,21 @@ static void si_emit_shader_es(struct si_context *sctx)<br>
>                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,<br>
>                                            SI_TRACKED_VGT_TF_PARAM,<br>
>                                            shader->vgt_tf_param);<br>
>  <br>
>         if (shader->vgt_vertex_reuse_block_cntl)<br>
>                 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,<br>
>                                            SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,<br>
>                                            shader->vgt_vertex_reuse_block_cntl);<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)<br>
>  {<br>
>         struct si_pm4_state *pm4;<br>
>         unsigned num_user_sgprs;<br>
>         unsigned vgpr_comp_cnt;<br>
>         uint64_t va;<br>
>         unsigned oc_lds_en;<br>
>  <br>
> @@ -818,21 +818,21 @@ static void si_emit_shader_gs(struct si_context *sctx)<br>
>                         radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,<br>
>                                                    SI_TRACKED_VGT_TF_PARAM,<br>
>                                                    shader->vgt_tf_param);<br>
>                 if (shader->vgt_vertex_reuse_block_cntl)<br>
>                         radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,<br>
>                                                    SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,<br>
>                                                    shader->vgt_vertex_reuse_block_cntl);<br>
>         }<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)<br>
>  {<br>
>         struct si_shader_selector *sel = shader->selector;<br>
>         const ubyte *num_components = sel->info.num_stream_output_components;<br>
>         unsigned gs_num_invocations = sel->gs_num_invocations;<br>
>         struct si_pm4_state *pm4;<br>
>         uint64_t va;<br>
>         unsigned max_stream = sel->max_gs_stream;<br>
> @@ -995,21 +995,21 @@ static void si_emit_shader_vs(struct si_context *sctx)<br>
>                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,<br>
>                                            SI_TRACKED_VGT_TF_PARAM,<br>
>                                            shader->vgt_tf_param);<br>
>  <br>
>         if (shader->vgt_vertex_reuse_block_cntl)<br>
>                 radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,<br>
>                                            SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,<br>
>                                            shader->vgt_vertex_reuse_block_cntl);<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  /**<br>
>   * Compute the state for \p shader, which will run as a vertex shader on the<br>
>   * hardware.<br>
>   *<br>
>   * If \p gs is non-NULL, it points to the geometry shader for which this shader<br>
>   * is the copy shader.<br>
>   */<br>
>  static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,<br>
> @@ -1187,21 +1187,21 @@ static void si_emit_shader_ps(struct si_context *sctx)<br>
>         radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,<br>
>                                     SI_TRACKED_SPI_SHADER_Z_FORMAT,<br>
>                                     shader->ctx_reg.ps.spi_shader_z_format,<br>
>                                     shader->ctx_reg.ps.spi_shader_col_format);<br>
>  <br>
>         radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,<br>
>                                    SI_TRACKED_CB_SHADER_MASK,<br>
>                                    shader->ctx_reg.ps.cb_shader_mask);<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  static void si_shader_ps(struct si_shader *shader)<br>
>  {<br>
>         struct tgsi_shader_info *info = &shader->selector->info;<br>
>         struct si_pm4_state *pm4;<br>
>         unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;<br>
>         unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);<br>
>         uint64_t va;<br>
>         unsigned input_ena = shader->config.spi_ps_input_ena;<br>
> @@ -2863,21 +2863,21 @@ static void si_emit_spi_map(struct si_context *sctx)<br>
>  <br>
>         /* R_028644_SPI_PS_INPUT_CNTL_0 */<br>
>         /* Dota 2: Only ~16% of SPI map updates set different values. */<br>
>         /* Talos: Only ~9% of SPI map updates set different values. */<br>
>         unsigned initial_cdw = sctx->gfx_cs->current.cdw;<br>
>         radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,<br>
>                                     spi_ps_input_cntl,<br>
>                                     sctx->tracked_regs.spi_ps_input_cntl, num_interp);<br>
>  <br>
>         if (initial_cdw != sctx->gfx_cs->current.cdw)<br>
> -               sctx->context_roll_counter++;<br>
> +               sctx->context_roll = true;<br>
>  }<br>
>  <br>
>  /**<br>
>   * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.<br>
>   */<br>
>  static void si_init_config_add_vgt_flush(struct si_context *sctx)<br>
>  {<br>
>         if (sctx->init_config_has_vgt_flush)<br>
>                 return;<br>
>  <br>
> diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c<br>
> index 2bf6862c89b..2a0a4bef9a2 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state_streamout.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c<br>
> @@ -296,20 +296,21 @@ void si_emit_streamout_end(struct si_context *sctx)<br>
>                 radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,<br>
>                                           t[i]->buf_filled_size,<br>
>                                           RADEON_USAGE_WRITE,<br>
>                                           RADEON_PRIO_SO_FILLED_SIZE);<br>
>  <br>
>                 /* Zero the buffer size. The counters (primitives generated,<br>
>                  * primitives emitted) may be enabled even if there is not<br>
>                  * buffer bound. This ensures that the primitives-emitted query<br>
>                  * won't increment. */<br>
>                 radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);<br>
> +               sctx->context_roll = true;<br>
>  <br>
>                 t[i]->buf_filled_size_valid = true;<br>
>         }<br>
>  <br>
>         sctx->streamout.begin_emitted = false;<br>
>  }<br>
>  <br>
>  /* STREAMOUT CONFIG DERIVED STATE<br>
>   *<br>
>   * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.<br>
> diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c<br>
> index f988da4520b..6f348a9b58d 100644<br>
> --- a/src/gallium/drivers/radeonsi/si_state_viewport.c<br>
> +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c<br>
> @@ -276,21 +276,21 @@ static void si_emit_guardband(struct si_context *ctx)<br>
>         radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,<br>
>                                    SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,<br>
>                                    S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |<br>
>                                    S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));<br>
>         radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,<br>
>                                    SI_TRACKED_PA_SU_VTX_CNTL,<br>
>                                    S_028BE4_PIX_CENTER(rs->half_pixel_center) |<br>
>                                    S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +<br>
>                                                        vp_as_scissor.quant_mode));<br>
>         if (initial_cdw != ctx->gfx_cs->current.cdw)<br>
> -               ctx->context_roll_counter++;<br>
> +               ctx->context_roll = true;<br>
>  }<br>
>  <br>
>  static void si_emit_scissors(struct si_context *ctx)<br>
>  {<br>
>         struct radeon_cmdbuf *cs = ctx->gfx_cs;<br>
>         struct pipe_scissor_state *states = ctx->scissors.states;<br>
>         unsigned mask = ctx->scissors.dirty_mask;<br>
>         bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;<br>
>  <br>
>         /* The simple case: Only 1 viewport is active. */<br>
> -- <br>
> 2.17.1<br>
> <br>
> _______________________________________________<br>
> mesa-dev mailing list<br>
> <a href="mailto:mesa-dev@lists.freedesktop.org" target="_blank" rel="noreferrer">mesa-dev@lists.freedesktop.org</a><br>
> <a href="https://lists.freedesktop.org/mailman/listinfo/mesa-dev" rel="noreferrer noreferrer" target="_blank">https://lists.freedesktop.org/mailman/listinfo/mesa-dev</a><br>
</blockquote></div>