[Mesa-dev] [PATCH 2/3] radeonsi/gfx9: rework the gfx9 scissor bug workaround (v2)

Dylan Baker dylan at pnwbakers.com
Thu Apr 25 22:38:27 UTC 2019


Awesome, thanks.

Quoting Marek Olšák (2019-04-25 14:50:52)
> Thanks. It looks good.
> 
> Marek
> 
> On Thu, Apr 25, 2019, 5:17 PM Dylan Baker <dylan at pnwbakers.com> wrote:
> 
>     Hi Marek,
> 
>     I've tried to apply this to 19.0, I had to pull "radeonsi: add
>     si_debug_options
>     for convenient adding/removing of options", which is fine, but this patch
>     also
>     assumes your si compute-queue only patches, which aren't present in 19.0.
>     I've
>     made a small change to get it compiling, but I'm sure it's not the right
>     fix, so
>     if you could take a look at the staging/19.0 branch and let me know what
>     you'd
>     like to do I'd appreciate it.
> 
>     Thanks,
>     Dylan
> 
>     Quoting Marek Olšák (2019-04-18 14:46:27)
>     > From: Marek Olšák <marek.olsak at amd.com>
>     >
>     > Needed to track context rolls caused by streamout and ACQUIRE_MEM.
>     > ACQUIRE_MEM can occur outside of draw calls.
>     >
>     > Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110355
>     >
>     > v2: squashed patches and done more rework
>     >
>     > Cc: 19.0 <mesa-stable at lists.freedesktop.org>
>     > ---
>     >  src/gallium/drivers/radeonsi/si_pipe.c        |  2 +
>     >  src/gallium/drivers/radeonsi/si_pipe.h        |  3 +-
>     >  src/gallium/drivers/radeonsi/si_state.c       |  8 +-
>     >  .../drivers/radeonsi/si_state_binning.c       |  4 +-
>     >  src/gallium/drivers/radeonsi/si_state_draw.c  | 86 +++++++++++--------
>     >  .../drivers/radeonsi/si_state_shaders.c       | 10 +--
>     >  .../drivers/radeonsi/si_state_streamout.c     |  1 +
>     >  .../drivers/radeonsi/si_state_viewport.c      |  2 +-
>     >  8 files changed, 68 insertions(+), 48 deletions(-)
>     >
>     > diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers
>     /radeonsi/si_pipe.c
>     > index fa96ce34224..7209db9fb37 100644
>     > --- a/src/gallium/drivers/radeonsi/si_pipe.c
>     > +++ b/src/gallium/drivers/radeonsi/si_pipe.c
>     > @@ -1072,20 +1072,22 @@ struct pipe_screen *radeonsi_screen_create(struct
>     radeon_winsys *ws,
>     >         sscreen->has_out_of_order_rast = sscreen->info.chip_class >= VI &&
>     >                                          sscreen->info.max_se >= 2 &&
>     >                                          !(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
>     >         sscreen->assume_no_z_fights =
>     >                 driQueryOptionb(config->options,
>     "radeonsi_assume_no_z_fights");
>     >         sscreen->commutative_blend_add =
>     >                 driQueryOptionb(config->options,
>     "radeonsi_commutative_blend_add");
>     >         sscreen->clear_db_cache_before_clear =
>     >                 driQueryOptionb(config->options,
>     "radeonsi_clear_db_cache_before_clear");
>     > +       sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 ||
>     > +                                       sscreen->info.family == CHIP_RAVEN;
>     >         sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >=
>     CHIP_POLARIS10 &&
>     >                                             sscreen->info.family <=
>     CHIP_POLARIS12) ||
>     >                                            sscreen->info.family ==
>     CHIP_VEGA10 ||
>     >                                            sscreen->info.family ==
>     CHIP_RAVEN;
>     >         sscreen->has_ls_vgpr_init_bug = sscreen->info.family ==
>     CHIP_VEGA10 ||
>     >                                         sscreen->info.family ==
>     CHIP_RAVEN;
>     >         sscreen->has_dcc_constant_encode = sscreen->info.family ==
>     CHIP_RAVEN2;
>>     >         /* Only enable primitive binning on APUs by default. */
>     >         sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN ||
>     > diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers
>     /radeonsi/si_pipe.h
>     > index aaa95f32d20..a4c90a4f69f 100644
>     > --- a/src/gallium/drivers/radeonsi/si_pipe.h
>     > +++ b/src/gallium/drivers/radeonsi/si_pipe.h
>     > @@ -463,20 +463,21 @@ struct si_screen {
>     >         unsigned                        eqaa_force_coverage_samples;
>     >         unsigned                        eqaa_force_z_samples;
>     >         unsigned                        eqaa_force_color_samples;
>     >         bool                            has_clear_state;
>     >         bool                            has_distributed_tess;
>     >         bool                            has_draw_indirect_multi;
>     >         bool                            has_out_of_order_rast;
>     >         bool                            assume_no_z_fights;
>     >         bool                            commutative_blend_add;
>     >         bool                            clear_db_cache_before_clear;
>     > +       bool                            has_gfx9_scissor_bug;
>     >         bool                            has_msaa_sample_loc_bug;
>     >         bool                            has_ls_vgpr_init_bug;
>     >         bool                            has_dcc_constant_encode;
>     >         bool                            dpbb_allowed;
>     >         bool                            dfsm_allowed;
>     >         bool                            llvm_has_working_vgpr_indexing;
>>     >         /* Whether shaders are monolithic (1-part) or separate (3-part).
>     */
>     >         bool                            use_monolithic_shaders;
>     >         bool                            record_llvm_ir;
>     > @@ -1062,21 +1063,21 @@ struct si_context {
>     >         unsigned                        num_vs_flushes;
>     >         unsigned                        num_ps_flushes;
>     >         unsigned                        num_cs_flushes;
>     >         unsigned                        num_cb_cache_flushes;
>     >         unsigned                        num_db_cache_flushes;
>     >         unsigned                        num_L2_invalidates;
>     >         unsigned                        num_L2_writebacks;
>     >         unsigned                        num_resident_handles;
>     >         uint64_t                        num_alloc_tex_transfer_bytes;
>     >         unsigned                        last_tex_ps_draw_ratio; /* for
>     query */
>     > -       unsigned                        context_roll_counter;
>     > +       unsigned                        context_roll;
>>     >         /* Queries. */
>     >         /* Maintain the list of active queries for pausing between IBs. */
>     >         int                             num_occlusion_queries;
>     >         int                             num_perfect_occlusion_queries;
>     >         struct list_head                active_queries;
>     >         unsigned                        num_cs_dw_queries_suspend;
>>     >         /* Render condition. */
>     >         struct pipe_query               *render_cond;
>     > diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/
>     drivers/radeonsi/si_state.c
>     > index 757c17f7df8..bc7e777ad73 100644
>     > --- a/src/gallium/drivers/radeonsi/si_state.c
>     > +++ b/src/gallium/drivers/radeonsi/si_state.c
>     > @@ -249,21 +249,21 @@ static void si_emit_cb_render_state(struct
>     si_context *sctx)
>     >                         }
>     >                 }
>>     >                 /* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON,
>     SX_BLEND_OPT_CONTROL */
>     >                 radeon_opt_set_context_reg3(sctx,
>     R_028754_SX_PS_DOWNCONVERT,
>     >                                             SI_TRACKED_SX_PS_DOWNCONVERT,
>     >                                             sx_ps_downconvert,
>     sx_blend_opt_epsilon,
>     >                                             sx_blend_opt_control);
>     >         }
>     >         if (initial_cdw != cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  /*
>     >   * Blender functions
>     >   */
>>     >  static uint32_t si_translate_blend_function(int blend_func)
>     >  {
>     >         switch (blend_func) {
>     >         case PIPE_BLEND_ADD:
>     > @@ -786,21 +786,21 @@ static void si_emit_clip_regs(struct si_context
>     *sctx)
>     >                 S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0)
>     |
>     >                 S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0)
>     |
>     >                 clipdist_mask | (culldist_mask << 8));
>     >         radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
>     >                 SI_TRACKED_PA_CL_CLIP_CNTL,
>     >                 rs->pa_cl_clip_cntl |
>     >                 ucp_mask |
>     >                 S_028810_CLIP_DISABLE(window_space));
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  /*
>     >   * inferred state between framebuffer and rasterizer
>     >   */
>     >  static void si_update_poly_offset_state(struct si_context *sctx)
>     >  {
>     >         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>>     >         if (!rs || !rs->uses_poly_offset || !sctx->
>     framebuffer.state.zsbuf) {
>     > @@ -1448,21 +1448,21 @@ static void si_emit_db_render_state(struct
>     si_context *sctx)
>     >                 db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
>>     >         if (sctx->screen->has_rbplus &&
>     >             !sctx->screen->rbplus_allowed)
>     >                 db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
>>     >         radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
>     >                                    SI_TRACKED_DB_SHADER_CONTROL,
>     db_shader_control);
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  /*
>     >   * format translation
>     >   */
>     >  static uint32_t si_translate_colorformat(enum pipe_format format)
>     >  {
>     >         const struct util_format_description *desc =
>     util_format_description(format);
>     >         if (!desc)
>     >                 return V_028C70_COLOR_INVALID;
>     > @@ -3537,21 +3537,21 @@ static void si_emit_msaa_config(struct si_context
>     *sctx)
>     >                                     SI_TRACKED_PA_SC_LINE_CNTL,
>     sc_line_cntl,
>     >                                     sc_aa_config);
>     >         /* R_028804_DB_EQAA */
>     >         radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA,
>     SI_TRACKED_DB_EQAA,
>     >                                    db_eqaa);
>     >         /* R_028A4C_PA_SC_MODE_CNTL_1 */
>     >         radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
>     >                                    SI_TRACKED_PA_SC_MODE_CNTL_1,
>     sc_mode_cntl_1);
>>     >         if (initial_cdw != cs->current.cdw) {
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>>     >                 /* GFX9: Flush DFSM when the AA mode changes. */
>     >                 if (sctx->screen->dfsm_allowed) {
>     >                         radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
>     >                         radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) |
>     EVENT_INDEX(0));
>     >                 }
>     >         }
>     >  }
>>     >  void si_update_ps_iter_samples(struct si_context *sctx)
>     > diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/
>     gallium/drivers/radeonsi/si_state_binning.c
>     > index 3516e561282..5c6c2e69b90 100644
>     > --- a/src/gallium/drivers/radeonsi/si_state_binning.c
>     > +++ b/src/gallium/drivers/radeonsi/si_state_binning.c
>     > @@ -314,21 +314,21 @@ static void si_emit_dpbb_disable(struct si_context
>     *sctx)
>>     >         radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
>     >                 SI_TRACKED_PA_SC_BINNER_CNTL_0,
>     >                 S_028C44_BINNING_MODE
>     (V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
>     >                 S_028C44_DISABLE_START_OF_PRIM(1));
>     >         radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
>     >                                    SI_TRACKED_DB_DFSM_CONTROL,
>     >                                    S_028060_PUNCHOUT_MODE
>     (V_028060_FORCE_OFF) |
>     >                                    S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  void si_emit_dpbb_state(struct si_context *sctx)
>     >  {
>     >         struct si_screen *sscreen = sctx->screen;
>     >         struct si_state_blend *blend = sctx->queued.named.blend;
>     >         struct si_state_dsa *dsa = sctx->queued.named.dsa;
>     >         unsigned db_shader_control = sctx->ps_db_shader_control;
>>     >         assert(sctx->chip_class >= GFX9);
>     > @@ -436,12 +436,12 @@ void si_emit_dpbb_state(struct si_context *sctx)
>     >                 S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
>     >                 S_028C44_PERSISTENT_STATES_PER_BIN
>     (persistent_states_per_bin) |
>     >                 S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
>     >                 S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
>     >                 S_028C44_OPTIMAL_BIN_SELECTION(1));
>     >         radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
>     >                                    SI_TRACKED_DB_DFSM_CONTROL,
>     >                                    S_028060_PUNCHOUT_MODE(punchout_mode)
>     |
>     >                                    S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>     > diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/
>     drivers/radeonsi/si_state_draw.c
>     > index 2a514f144b9..8798f9ad0a0 100644
>     > --- a/src/gallium/drivers/radeonsi/si_state_draw.c
>     > +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
>     > @@ -59,21 +59,21 @@ static unsigned si_conv_pipe_prim(unsigned mode)
>     >         return prim_conv[mode];
>     >  }
>>     >  /**
>     >   * This calculates the LDS size for tessellation shaders (VS, TCS, TES).
>     >   * LS.LDS_SIZE is shared by all 3 shader stages.
>     >   *
>     >   * The information about LDS and other non-compile-time parameters is
>     then
>     >   * written to userdata SGPRs.
>     >   */
>     > -static bool si_emit_derived_tess_state(struct si_context *sctx,
>     > +static void si_emit_derived_tess_state(struct si_context *sctx,
>     >                                        const struct pipe_draw_info *info,
>     >                                        unsigned *num_patches)
>     >  {
>     >         struct radeon_cmdbuf *cs = sctx->gfx_cs;
>     >         struct si_shader *ls_current;
>     >         struct si_shader_selector *ls;
>     >         /* The TES pointer will only be used for sctx->last_tcs.
>     >          * It would be wrong to think that TCS = TES. */
>     >         struct si_shader_selector *tcs =
>     >                 sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->
>     tes_shader.cso;
>     > @@ -103,21 +103,21 @@ static bool si_emit_derived_tess_state(struct
>     si_context *sctx,
>     >                 ls = sctx->vs_shader.cso;
>     >         }
>>     >         if (sctx->last_ls == ls_current &&
>     >             sctx->last_tcs == tcs &&
>     >             sctx->last_tes_sh_base == tes_sh_base &&
>     >             sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
>     >             (!has_primid_instancing_bug ||
>     >              (sctx->last_tess_uses_primid == tess_uses_primid))) {
>     >                 *num_patches = sctx->last_num_patches;
>     > -               return false;
>     > +               return;
>     >         }
>>     >         sctx->last_ls = ls_current;
>     >         sctx->last_tcs = tcs;
>     >         sctx->last_tes_sh_base = tes_sh_base;
>     >         sctx->last_num_tcs_input_cp = num_tcs_input_cp;
>     >         sctx->last_tess_uses_primid = tess_uses_primid;
>>     >         /* This calculates how shader inputs and outputs among VS, TCS,
>     and TES
>     >          * are laid out in LDS. */
>     > @@ -298,23 +298,22 @@ static bool si_emit_derived_tess_state(struct
>     si_context *sctx,
>>     >         if (sctx->last_ls_hs_config != ls_hs_config) {
>     >                 if (sctx->chip_class >= CIK) {
>     >                         radeon_set_context_reg_idx(cs,
>     R_028B58_VGT_LS_HS_CONFIG, 2,
>     >                                                    ls_hs_config);
>     >                 } else {
>     >                         radeon_set_context_reg(cs,
>     R_028B58_VGT_LS_HS_CONFIG,
>     >                                                ls_hs_config);
>     >                 }
>     >                 sctx->last_ls_hs_config = ls_hs_config;
>     > -               return true; /* true if the context rolls */
>     > +               sctx->context_roll = true;
>     >         }
>     > -       return false;
>     >  }
>>     >  static unsigned si_num_prims_for_vertices(const struct pipe_draw_info
>     *info)
>     >  {
>     >         switch (info->mode) {
>     >         case PIPE_PRIM_PATCHES:
>     >                 return info->count / info->vertices_per_patch;
>     >         case PIPE_PRIM_POLYGON:
>     >                 return info->count >= 3;
>     >         case SI_PRIM_RECTANGLE_LIST:
>     > @@ -534,44 +533,44 @@ static unsigned si_get_ia_multi_vgt_param(struct
>     si_context *sctx,
>     >                      (info->instance_count > 1 &&
>     >                       (info->count_from_stream_output ||
>     >                        si_num_prims_for_vertices(info) <= 1))))
>     >                         sctx->flags |= SI_CONTEXT_VGT_FLUSH;
>     >         }
>>     >         return ia_multi_vgt_param;
>     >  }
>>     >  /* rast_prim is the primitive type after GS. */
>     > -static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
>     > +static void si_emit_rasterizer_prim_state(struct si_context *sctx)
>     >  {
>     >         struct radeon_cmdbuf *cs = sctx->gfx_cs;
>     >         enum pipe_prim_type rast_prim = sctx->current_rast_prim;
>     >         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>>     >         /* Skip this if not rendering lines. */
>     >         if (!util_prim_is_lines(rast_prim))
>     > -               return false;
>     > +               return;
>>     >         if (rast_prim == sctx->last_rast_prim &&
>     >             rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)
>     > -               return false;
>     > +               return;
>>     >         /* For lines, reset the stipple pattern at each primitive.
>     Otherwise,
>     >          * reset the stipple pattern at each packet (line strips, line
>     loops).
>     >          */
>     >         radeon_set_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE,
>     >                 rs->pa_sc_line_stipple |
>     >                 S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1
>     : 2));
>>     >         sctx->last_rast_prim = rast_prim;
>     >         sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
>     > -       return true; /* true if the context rolls */
>     > +       sctx->context_roll = true;
>     >  }
>>     >  static void si_emit_vs_state(struct si_context *sctx,
>     >                              const struct pipe_draw_info *info)
>     >  {
>     >         sctx->current_vs_state &= C_VS_STATE_INDEXED;
>     >         sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
>>     >         if (sctx->num_vs_blit_sgprs) {
>     >                 /* Re-emit the state after we leave u_blitter. */
>     > @@ -652,20 +651,21 @@ static void si_emit_draw_registers(struct
>     si_context *sctx,
>     >                         radeon_set_context_reg(cs,
>     R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
>     >                                                info->primitive_restart);
>>     >                 sctx->last_primitive_restart_en = info->
>     primitive_restart;
>>     >         }
>     >         if (si_prim_restart_index_changed(sctx, info)) {
>     >                 radeon_set_context_reg(cs,
>     R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
>     >                                        info->restart_index);
>     >                 sctx->last_restart_index = info->restart_index;
>     > +               sctx->context_roll = true;
>     >         }
>     >  }
>>     >  static void si_emit_draw_packets(struct si_context *sctx,
>     >                                  const struct pipe_draw_info *info,
>     >                                  struct pipe_resource *indexbuf,
>     >                                  unsigned index_size,
>     >                                  unsigned index_offset)
>     >  {
>     >         struct pipe_draw_indirect_info *indirect = info->indirect;
>     > @@ -889,20 +889,25 @@ static void si_emit_surface_sync(struct si_context
>     *sctx,
>     >                 radeon_emit(cs, 0);             /* CP_COHER_BASE_HI */
>     >                 radeon_emit(cs, 0x0000000A);    /* POLL_INTERVAL */
>     >         } else {
>     >                 /* ACQUIRE_MEM is only required on a compute ring. */
>     >                 radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
>     >                 radeon_emit(cs, cp_coher_cntl);   /* CP_COHER_CNTL */
>     >                 radeon_emit(cs, 0xffffffff);      /* CP_COHER_SIZE */
>     >                 radeon_emit(cs, 0);               /* CP_COHER_BASE */
>     >                 radeon_emit(cs, 0x0000000A);      /* POLL_INTERVAL */
>     >         }
>     > +
>     > +       /* ACQUIRE_MEM has an implicit context roll if the current
>     context
>     > +        * is busy. */
>     > +       if (sctx->has_graphics)
>     > +               sctx->context_roll = true;
>     >  }
>>     >  void si_emit_cache_flush(struct si_context *sctx)
>     >  {
>     >         struct radeon_cmdbuf *cs = sctx->gfx_cs;
>     >         uint32_t flags = sctx->flags;
>>     >         if (!sctx->has_graphics) {
>     >                 /* Only process compute flags. */
>     >                 flags &= SI_CONTEXT_INV_ICACHE |
>     > @@ -1216,40 +1221,24 @@ static void si_get_draw_start_count(struct
>     si_context *sctx,
>     >         } else {
>     >                 *start = info->start;
>     >                 *count = info->count;
>     >         }
>     >  }
>>     >  static void si_emit_all_states(struct si_context *sctx, const struct
>     pipe_draw_info *info,
>     >                                unsigned skip_atom_mask)
>     >  {
>     >         unsigned num_patches = 0;
>     > -       /* Vega10/Raven scissor bug workaround. When any context register
>     is
>     > -        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
>     > -        * registers must be written too.
>     > -        */
>     > -       bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->
>     family == CHIP_RAVEN) &&
>     > -                                 !si_is_atom_dirty(sctx, &sctx->
>     atoms.s.scissors);
>     > -       bool context_roll = false; /* set correctly for GFX9 only */
>>     > -       context_roll |= si_emit_rasterizer_prim_state(sctx);
>     > +       si_emit_rasterizer_prim_state(sctx);
>     >         if (sctx->tes_shader.cso)
>     > -               context_roll |= si_emit_derived_tess_state(sctx, info, &
>     num_patches);
>     > -
>     > -       if (handle_scissor_bug &&
>     > -           (info->count_from_stream_output ||
>     > -            sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
>     > -            sctx->dirty_states & si_states_that_always_roll_context() ||
>     > -            si_prim_restart_index_changed(sctx, info)))
>     > -               context_roll = true;
>     > -
>     > -       sctx->context_roll_counter = 0;
>     > +               si_emit_derived_tess_state(sctx, info, &num_patches);
>>     >         /* Emit state atoms. */
>     >         unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
>     >         while (mask)
>     >                 sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
>>     >         sctx->dirty_atoms &= skip_atom_mask;
>>     >         /* Emit states. */
>     >         mask = sctx->dirty_states;
>     > @@ -1258,26 +1247,20 @@ static void si_emit_all_states(struct si_context
>     *sctx, const struct pipe_draw_i
>     >                 struct si_pm4_state *state = sctx->queued.array[i];
>>     >                 if (!state || sctx->emitted.array[i] == state)
>     >                         continue;
>>     >                 si_pm4_emit(sctx, state);
>     >                 sctx->emitted.array[i] = state;
>     >         }
>     >         sctx->dirty_states = 0;
>>     > -       if (handle_scissor_bug &&
>     > -           (context_roll || sctx->context_roll_counter)) {
>     > -               sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
>     > -               sctx->atoms.s.scissors.emit(sctx);
>     > -       }
>     > -
>     >         /* Emit draw states. */
>     >         si_emit_vs_state(sctx, info);
>     >         si_emit_draw_registers(sctx, info, num_patches);
>     >  }
>>     >  static void si_draw_vbo(struct pipe_context *ctx, const struct
>     pipe_draw_info *info)
>     >  {
>     >         struct si_context *sctx = (struct si_context *)ctx;
>     >         struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
>     >         struct pipe_resource *indexbuf = info->index.resource;
>     > @@ -1462,45 +1445,66 @@ static void si_draw_vbo(struct pipe_context *ctx,
>     const struct pipe_draw_info *i
>>     >         si_need_gfx_cs_space(sctx);
>>     >         /* Since we've called si_context_add_resource_size for vertex
>     buffers,
>     >          * this must be called after si_need_cs_space, because we must
>     let
>     >          * need_cs_space flush before we add buffers to the buffer list.
>     >          */
>     >         if (!si_upload_vertex_buffer_descriptors(sctx))
>     >                 goto return_cleanup;
>>     > +       /* Vega10/Raven scissor bug workaround. When any context register
>     is
>     > +        * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
>     > +        * registers must be written too.
>     > +        */
>     > +       bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug;
>     > +       unsigned masked_atoms = 0;
>     > +
>     > +       if (has_gfx9_scissor_bug) {
>     > +               masked_atoms |= si_get_atom_bit(sctx, &sctx->
>     atoms.s.scissors);
>     > +
>     > +               if (info->count_from_stream_output ||
>     > +                   sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
>     > +                   sctx->dirty_states & si_states_that_always_roll_context())
>     > +                       sctx->context_roll = true;
>     > +       }
>     > +
>     >         /* Use optimal packet order based on whether we need to sync the
>     pipeline. */
>     >         if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
>     >                                       SI_CONTEXT_FLUSH_AND_INV_DB |
>     >                                       SI_CONTEXT_PS_PARTIAL_FLUSH |
>     >                                       SI_CONTEXT_CS_PARTIAL_FLUSH))) {
>     >                 /* If we have to wait for idle, set all states first, so
>     that all
>     >                  * SET packets are processed in parallel with previous
>     draw calls.
>     >                  * Then draw and prefetch at the end. This ensures that
>     the time
>     >                  * the CUs are idle is very short.
>     >                  */
>     > -               unsigned masked_atoms = 0;
>     > -
>     >                 if (unlikely(sctx->flags &
>     SI_CONTEXT_FLUSH_FOR_RENDER_COND))
>     >                         masked_atoms |= si_get_atom_bit(sctx, &sctx->
>     atoms.s.render_cond);
>>     >                 if (!si_upload_graphics_shader_descriptors(sctx))
>     >                         goto return_cleanup;
>>     >                 /* Emit all states except possibly render condition. */
>     >                 si_emit_all_states(sctx, info, masked_atoms);
>     >                 si_emit_cache_flush(sctx);
>     >                 /* <-- CUs are idle here. */
>>     >                 if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
>     >                         sctx->atoms.s.render_cond.emit(sctx);
>     > +
>     > +               if (has_gfx9_scissor_bug &&
>     > +                   (sctx->context_roll ||
>     > +                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
>     > +                       sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
>     > +                       sctx->atoms.s.scissors.emit(sctx);
>     > +               }
>     >                 sctx->dirty_atoms = 0;
>>     >                 si_emit_draw_packets(sctx, info, indexbuf, index_size,
>     index_offset);
>     >                 /* <-- CUs are busy here. */
>>     >                 /* Start prefetches after the draw has been started. Both
>     will run
>     >                  * in parallel, but starting the draw first is more
>     important.
>     >                  */
>     >                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
>     >                         cik_emit_prefetch_L2(sctx, false);
>     > @@ -1511,29 +1515,41 @@ static void si_draw_vbo(struct pipe_context *ctx,
>     const struct pipe_draw_info *i
>     >                 if (sctx->flags)
>     >                         si_emit_cache_flush(sctx);
>>     >                 /* Only prefetch the API VS and VBO descriptors. */
>     >                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
>     >                         cik_emit_prefetch_L2(sctx, true);
>>     >                 if (!si_upload_graphics_shader_descriptors(sctx))
>     >                         return;
>>     > -               si_emit_all_states(sctx, info, 0);
>     > +               si_emit_all_states(sctx, info, masked_atoms);
>     > +
>     > +               if (has_gfx9_scissor_bug &&
>     > +                   (sctx->context_roll ||
>     > +                    si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
>     > +                       sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
>     > +                       sctx->atoms.s.scissors.emit(sctx);
>     > +               }
>     > +               sctx->dirty_atoms = 0;
>     > +
>     >                 si_emit_draw_packets(sctx, info, indexbuf, index_size,
>     index_offset);
>>     >                 /* Prefetch the remaining shaders after the draw has been
>     >                  * started. */
>     >                 if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
>     >                         cik_emit_prefetch_L2(sctx, false);
>     >         }
>>     > +       /* Clear the context roll flag after the draw call. */
>     > +       sctx->context_roll = false;
>     > +
>     >         if (unlikely(sctx->current_saved_cs)) {
>     >                 si_trace_emit(sctx);
>     >                 si_log_draw_state(sctx, sctx->log);
>     >         }
>>     >         /* Workaround for a VGT hang when streamout is enabled.
>     >          * It must be done after drawing. */
>     >         if ((sctx->family == CHIP_HAWAII ||
>     >              sctx->family == CHIP_TONGA ||
>     >              sctx->family == CHIP_FIJI) &&
>     > diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/
>     gallium/drivers/radeonsi/si_state_shaders.c
>     > index 5bdfd4f6ac1..d00bb170981 100644
>     > --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
>     > +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
>     > @@ -569,21 +569,21 @@ static void si_emit_shader_es(struct si_context
>     *sctx)
>     >                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
>     >                                            SI_TRACKED_VGT_TF_PARAM,
>     >                                            shader->vgt_tf_param);
>>     >         if (shader->vgt_vertex_reuse_block_cntl)
>     >                 radeon_opt_set_context_reg(sctx,
>     R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
>     >                                           
>     SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
>     >                                            shader->
>     vgt_vertex_reuse_block_cntl);
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  static void si_shader_es(struct si_screen *sscreen, struct si_shader
>     *shader)
>     >  {
>     >         struct si_pm4_state *pm4;
>     >         unsigned num_user_sgprs;
>     >         unsigned vgpr_comp_cnt;
>     >         uint64_t va;
>     >         unsigned oc_lds_en;
>>     > @@ -818,21 +818,21 @@ static void si_emit_shader_gs(struct si_context
>     *sctx)
>     >                         radeon_opt_set_context_reg(sctx,
>     R_028B6C_VGT_TF_PARAM,
>     >                                                   
>     SI_TRACKED_VGT_TF_PARAM,
>     >                                                    shader->vgt_tf_param);
>     >                 if (shader->vgt_vertex_reuse_block_cntl)
>     >                         radeon_opt_set_context_reg(sctx,
>     R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
>     >                                                   
>     SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
>     >                                                    shader->
>     vgt_vertex_reuse_block_cntl);
>     >         }
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  static void si_shader_gs(struct si_screen *sscreen, struct si_shader
>     *shader)
>     >  {
>     >         struct si_shader_selector *sel = shader->selector;
>     >         const ubyte *num_components = sel->
>     info.num_stream_output_components;
>     >         unsigned gs_num_invocations = sel->gs_num_invocations;
>     >         struct si_pm4_state *pm4;
>     >         uint64_t va;
>     >         unsigned max_stream = sel->max_gs_stream;
>     > @@ -995,21 +995,21 @@ static void si_emit_shader_vs(struct si_context
>     *sctx)
>     >                 radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
>     >                                            SI_TRACKED_VGT_TF_PARAM,
>     >                                            shader->vgt_tf_param);
>>     >         if (shader->vgt_vertex_reuse_block_cntl)
>     >                 radeon_opt_set_context_reg(sctx,
>     R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
>     >                                           
>     SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
>     >                                            shader->
>     vgt_vertex_reuse_block_cntl);
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  /**
>     >   * Compute the state for \p shader, which will run as a vertex shader on
>     the
>     >   * hardware.
>     >   *
>     >   * If \p gs is non-NULL, it points to the geometry shader for which this
>     shader
>     >   * is the copy shader.
>     >   */
>     >  static void si_shader_vs(struct si_screen *sscreen, struct si_shader
>     *shader,
>     > @@ -1187,21 +1187,21 @@ static void si_emit_shader_ps(struct si_context
>     *sctx)
>     >         radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
>     >                                     SI_TRACKED_SPI_SHADER_Z_FORMAT,
>     >                                     shader->
>     ctx_reg.ps.spi_shader_z_format,
>     >                                     shader->
>     ctx_reg.ps.spi_shader_col_format);
>>     >         radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
>     >                                    SI_TRACKED_CB_SHADER_MASK,
>     >                                    shader->ctx_reg.ps.cb_shader_mask);
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  static void si_shader_ps(struct si_shader *shader)
>     >  {
>     >         struct tgsi_shader_info *info = &shader->selector->info;
>     >         struct si_pm4_state *pm4;
>     >         unsigned spi_ps_in_control, spi_shader_col_format,
>     cb_shader_mask;
>     >         unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
>     >         uint64_t va;
>     >         unsigned input_ena = shader->config.spi_ps_input_ena;
>     > @@ -2863,21 +2863,21 @@ static void si_emit_spi_map(struct si_context
>     *sctx)
>>     >         /* R_028644_SPI_PS_INPUT_CNTL_0 */
>     >         /* Dota 2: Only ~16% of SPI map updates set different values. */
>     >         /* Talos: Only ~9% of SPI map updates set different values. */
>     >         unsigned initial_cdw = sctx->gfx_cs->current.cdw;
>     >         radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
>     >                                     spi_ps_input_cntl,
>     >                                     sctx->tracked_regs.spi_ps_input_cntl,
>     num_interp);
>>     >         if (initial_cdw != sctx->gfx_cs->current.cdw)
>     > -               sctx->context_roll_counter++;
>     > +               sctx->context_roll = true;
>     >  }
>>     >  /**
>     >   * Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before
>     that.
>     >   */
>     >  static void si_init_config_add_vgt_flush(struct si_context *sctx)
>     >  {
>     >         if (sctx->init_config_has_vgt_flush)
>     >                 return;
>>     > diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/
>     gallium/drivers/radeonsi/si_state_streamout.c
>     > index 2bf6862c89b..2a0a4bef9a2 100644
>     > --- a/src/gallium/drivers/radeonsi/si_state_streamout.c
>     > +++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
>     > @@ -296,20 +296,21 @@ void si_emit_streamout_end(struct si_context *sctx)
>     >                 radeon_add_to_buffer_list(sctx,  sctx->gfx_cs,
>     >                                           t[i]->buf_filled_size,
>     >                                           RADEON_USAGE_WRITE,
>     >                                           RADEON_PRIO_SO_FILLED_SIZE);
>>     >                 /* Zero the buffer size. The counters (primitives
>     generated,
>     >                  * primitives emitted) may be enabled even if there is
>     no
>     >                  * buffer bound. This ensures that the primitives-emitted
>     query
>     >                  * won't increment. */
>     >                 radeon_set_context_reg(cs,
>     R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
>     > +               sctx->context_roll = true;
>>     >                 t[i]->buf_filled_size_valid = true;
>     >         }
>>     >         sctx->streamout.begin_emitted = false;
>     >  }
>>     >  /* STREAMOUT CONFIG DERIVED STATE
>     >   *
>     >   * Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
>     > diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/
>     gallium/drivers/radeonsi/si_state_viewport.c
>     > index f988da4520b..6f348a9b58d 100644
>     > --- a/src/gallium/drivers/radeonsi/si_state_viewport.c
>     > +++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
>     > @@ -276,21 +276,21 @@ static void si_emit_guardband(struct si_context
>     *ctx)
>     >         radeon_opt_set_context_reg(ctx,
>     R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
>     >                                   
>     SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
>     >                                    S_028234_HW_SCREEN_OFFSET_X
>     (hw_screen_offset_x >> 4) |
>     >                                    S_028234_HW_SCREEN_OFFSET_Y
>     (hw_screen_offset_y >> 4));
>     >         radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
>     >                                    SI_TRACKED_PA_SU_VTX_CNTL,
>     >                                    S_028BE4_PIX_CENTER(rs->
>     half_pixel_center) |
>     >                                    S_028BE4_QUANT_MODE
>     (V_028BE4_X_16_8_FIXED_POINT_1_256TH +
>     >                                                       
>     vp_as_scissor.quant_mode));
>     >         if (initial_cdw != ctx->gfx_cs->current.cdw)
>     > -               ctx->context_roll_counter++;
>     > +               ctx->context_roll = true;
>     >  }
>>     >  static void si_emit_scissors(struct si_context *ctx)
>     >  {
>     >         struct radeon_cmdbuf *cs = ctx->gfx_cs;
>     >         struct pipe_scissor_state *states = ctx->scissors.states;
>     >         unsigned mask = ctx->scissors.dirty_mask;
>     >         bool scissor_enabled = ctx->queued.named.rasterizer->
>     scissor_enable;
>>     >         /* The simple case: Only 1 viewport is active. */
>     > --
>     > 2.17.1
>     >
>     > _______________________________________________
>     > mesa-dev mailing list
>     > mesa-dev at lists.freedesktop.org
>     > https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 
-------------- next part --------------
A non-text attachment was scrubbed...
Name: signature.asc
Type: application/pgp-signature
Size: 228 bytes
Desc: signature
URL: <https://lists.freedesktop.org/archives/mesa-dev/attachments/20190425/471d66e0/attachment-0001.sig>


More information about the mesa-dev mailing list