[Mesa-dev] [PATCH 2/3] radeonsi/gfx9: rework the gfx9 scissor bug workaround (v2)
Marek Olšák
maraeo at gmail.com
Thu Apr 18 21:46:27 UTC 2019
From: Marek Olšák <marek.olsak at amd.com>
The rework is needed to track context rolls caused by streamout and ACQUIRE_MEM;
ACQUIRE_MEM can occur outside of draw calls.
Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=110355
v2: squashed the patches and did more rework
Cc: 19.0 <mesa-stable at lists.freedesktop.org>
---
src/gallium/drivers/radeonsi/si_pipe.c | 2 +
src/gallium/drivers/radeonsi/si_pipe.h | 3 +-
src/gallium/drivers/radeonsi/si_state.c | 8 +-
.../drivers/radeonsi/si_state_binning.c | 4 +-
src/gallium/drivers/radeonsi/si_state_draw.c | 86 +++++++++++--------
.../drivers/radeonsi/si_state_shaders.c | 10 +--
.../drivers/radeonsi/si_state_streamout.c | 1 +
.../drivers/radeonsi/si_state_viewport.c | 2 +-
8 files changed, 68 insertions(+), 48 deletions(-)
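
For readers skimming the diff: the rework reduces to a single boolean,
sctx->context_roll, which any path that writes context registers (state
atoms, streamout end, ACQUIRE_MEM) sets, and which si_draw_vbo checks to
decide whether the PA_SC_VPORT_SCISSOR_* registers must be re-emitted on
Vega10/Raven, clearing the flag after the draw. The standalone C sketch
below only illustrates that flow; every name in it (fake_ctx,
emit_some_context_state, emit_acquire_mem, draw) is made up for
illustration and is not the driver's API.

  /* Simplified sketch of the reworked tracking; illustrative only,
   * the real logic lives in si_state_draw.c and the state emitters. */
  #include <stdbool.h>
  #include <stdio.h>

  struct fake_ctx {
     bool has_gfx9_scissor_bug;  /* Vega10/Raven only */
     bool context_roll;          /* set by anything that rolls the context */
     bool scissors_dirty;
  };

  /* Any state emitter that ends up writing context registers sets the flag. */
  static void emit_some_context_state(struct fake_ctx *ctx, bool regs_written)
  {
     if (regs_written)
        ctx->context_roll = true;
  }

  /* ACQUIRE_MEM (and streamout end) also roll the context, even outside
   * of draw calls, as long as a graphics context is in use. */
  static void emit_acquire_mem(struct fake_ctx *ctx, bool has_graphics)
  {
     if (has_graphics)
        ctx->context_roll = true;
  }

  static void draw(struct fake_ctx *ctx)
  {
     emit_some_context_state(ctx, true);

     /* Workaround: if the context rolled, re-emit the scissor registers
      * before the draw packet on the affected chips. */
     if (ctx->has_gfx9_scissor_bug &&
         (ctx->context_roll || ctx->scissors_dirty)) {
        printf("re-emitting scissor registers\n");
        ctx->scissors_dirty = false;
     }

     printf("draw packet\n");
     ctx->context_roll = false; /* cleared after each draw */
  }

  int main(void)
  {
     struct fake_ctx ctx = { .has_gfx9_scissor_bug = true };

     emit_acquire_mem(&ctx, true); /* rolls the context outside a draw */
     draw(&ctx);
     return 0;
  }

Running the sketch should print the scissor re-emit line before the draw
packet, mirroring the ordering the patch enforces in si_draw_vbo.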
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index fa96ce34224..7209db9fb37 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -1072,20 +1072,22 @@ struct pipe_screen *radeonsi_screen_create(struct radeon_winsys *ws,
sscreen->has_out_of_order_rast = sscreen->info.chip_class >= VI &&
sscreen->info.max_se >= 2 &&
!(sscreen->debug_flags & DBG(NO_OUT_OF_ORDER));
sscreen->assume_no_z_fights =
driQueryOptionb(config->options, "radeonsi_assume_no_z_fights");
sscreen->commutative_blend_add =
driQueryOptionb(config->options, "radeonsi_commutative_blend_add");
sscreen->clear_db_cache_before_clear =
driQueryOptionb(config->options, "radeonsi_clear_db_cache_before_clear");
+ sscreen->has_gfx9_scissor_bug = sscreen->info.family == CHIP_VEGA10 ||
+ sscreen->info.family == CHIP_RAVEN;
sscreen->has_msaa_sample_loc_bug = (sscreen->info.family >= CHIP_POLARIS10 &&
sscreen->info.family <= CHIP_POLARIS12) ||
sscreen->info.family == CHIP_VEGA10 ||
sscreen->info.family == CHIP_RAVEN;
sscreen->has_ls_vgpr_init_bug = sscreen->info.family == CHIP_VEGA10 ||
sscreen->info.family == CHIP_RAVEN;
sscreen->has_dcc_constant_encode = sscreen->info.family == CHIP_RAVEN2;
/* Only enable primitive binning on APUs by default. */
sscreen->dpbb_allowed = sscreen->info.family == CHIP_RAVEN ||
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index aaa95f32d20..a4c90a4f69f 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -463,20 +463,21 @@ struct si_screen {
unsigned eqaa_force_coverage_samples;
unsigned eqaa_force_z_samples;
unsigned eqaa_force_color_samples;
bool has_clear_state;
bool has_distributed_tess;
bool has_draw_indirect_multi;
bool has_out_of_order_rast;
bool assume_no_z_fights;
bool commutative_blend_add;
bool clear_db_cache_before_clear;
+ bool has_gfx9_scissor_bug;
bool has_msaa_sample_loc_bug;
bool has_ls_vgpr_init_bug;
bool has_dcc_constant_encode;
bool dpbb_allowed;
bool dfsm_allowed;
bool llvm_has_working_vgpr_indexing;
/* Whether shaders are monolithic (1-part) or separate (3-part). */
bool use_monolithic_shaders;
bool record_llvm_ir;
@@ -1062,21 +1063,21 @@ struct si_context {
unsigned num_vs_flushes;
unsigned num_ps_flushes;
unsigned num_cs_flushes;
unsigned num_cb_cache_flushes;
unsigned num_db_cache_flushes;
unsigned num_L2_invalidates;
unsigned num_L2_writebacks;
unsigned num_resident_handles;
uint64_t num_alloc_tex_transfer_bytes;
unsigned last_tex_ps_draw_ratio; /* for query */
- unsigned context_roll_counter;
+ unsigned context_roll;
/* Queries. */
/* Maintain the list of active queries for pausing between IBs. */
int num_occlusion_queries;
int num_perfect_occlusion_queries;
struct list_head active_queries;
unsigned num_cs_dw_queries_suspend;
/* Render condition. */
struct pipe_query *render_cond;
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 757c17f7df8..bc7e777ad73 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -249,21 +249,21 @@ static void si_emit_cb_render_state(struct si_context *sctx)
}
}
/* SX_PS_DOWNCONVERT, SX_BLEND_OPT_EPSILON, SX_BLEND_OPT_CONTROL */
radeon_opt_set_context_reg3(sctx, R_028754_SX_PS_DOWNCONVERT,
SI_TRACKED_SX_PS_DOWNCONVERT,
sx_ps_downconvert, sx_blend_opt_epsilon,
sx_blend_opt_control);
}
if (initial_cdw != cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/*
* Blender functions
*/
static uint32_t si_translate_blend_function(int blend_func)
{
switch (blend_func) {
case PIPE_BLEND_ADD:
@@ -786,21 +786,21 @@ static void si_emit_clip_regs(struct si_context *sctx)
S_02881C_VS_OUT_CCDIST0_VEC_ENA((total_mask & 0x0F) != 0) |
S_02881C_VS_OUT_CCDIST1_VEC_ENA((total_mask & 0xF0) != 0) |
clipdist_mask | (culldist_mask << 8));
radeon_opt_set_context_reg(sctx, R_028810_PA_CL_CLIP_CNTL,
SI_TRACKED_PA_CL_CLIP_CNTL,
rs->pa_cl_clip_cntl |
ucp_mask |
S_028810_CLIP_DISABLE(window_space));
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/*
* inferred state between framebuffer and rasterizer
*/
static void si_update_poly_offset_state(struct si_context *sctx)
{
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
if (!rs || !rs->uses_poly_offset || !sctx->framebuffer.state.zsbuf) {
@@ -1448,21 +1448,21 @@ static void si_emit_db_render_state(struct si_context *sctx)
db_shader_control &= C_02880C_MASK_EXPORT_ENABLE;
if (sctx->screen->has_rbplus &&
!sctx->screen->rbplus_allowed)
db_shader_control |= S_02880C_DUAL_QUAD_DISABLE(1);
radeon_opt_set_context_reg(sctx, R_02880C_DB_SHADER_CONTROL,
SI_TRACKED_DB_SHADER_CONTROL, db_shader_control);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/*
* format translation
*/
static uint32_t si_translate_colorformat(enum pipe_format format)
{
const struct util_format_description *desc = util_format_description(format);
if (!desc)
return V_028C70_COLOR_INVALID;
@@ -3537,21 +3537,21 @@ static void si_emit_msaa_config(struct si_context *sctx)
SI_TRACKED_PA_SC_LINE_CNTL, sc_line_cntl,
sc_aa_config);
/* R_028804_DB_EQAA */
radeon_opt_set_context_reg(sctx, R_028804_DB_EQAA, SI_TRACKED_DB_EQAA,
db_eqaa);
/* R_028A4C_PA_SC_MODE_CNTL_1 */
radeon_opt_set_context_reg(sctx, R_028A4C_PA_SC_MODE_CNTL_1,
SI_TRACKED_PA_SC_MODE_CNTL_1, sc_mode_cntl_1);
if (initial_cdw != cs->current.cdw) {
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
/* GFX9: Flush DFSM when the AA mode changes. */
if (sctx->screen->dfsm_allowed) {
radeon_emit(cs, PKT3(PKT3_EVENT_WRITE, 0, 0));
radeon_emit(cs, EVENT_TYPE(V_028A90_FLUSH_DFSM) | EVENT_INDEX(0));
}
}
}
void si_update_ps_iter_samples(struct si_context *sctx)
diff --git a/src/gallium/drivers/radeonsi/si_state_binning.c b/src/gallium/drivers/radeonsi/si_state_binning.c
index 3516e561282..5c6c2e69b90 100644
--- a/src/gallium/drivers/radeonsi/si_state_binning.c
+++ b/src/gallium/drivers/radeonsi/si_state_binning.c
@@ -314,21 +314,21 @@ static void si_emit_dpbb_disable(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028C44_PA_SC_BINNER_CNTL_0,
SI_TRACKED_PA_SC_BINNER_CNTL_0,
S_028C44_BINNING_MODE(V_028C44_DISABLE_BINNING_USE_LEGACY_SC) |
S_028C44_DISABLE_START_OF_PRIM(1));
radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
SI_TRACKED_DB_DFSM_CONTROL,
S_028060_PUNCHOUT_MODE(V_028060_FORCE_OFF) |
S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
void si_emit_dpbb_state(struct si_context *sctx)
{
struct si_screen *sscreen = sctx->screen;
struct si_state_blend *blend = sctx->queued.named.blend;
struct si_state_dsa *dsa = sctx->queued.named.dsa;
unsigned db_shader_control = sctx->ps_db_shader_control;
assert(sctx->chip_class >= GFX9);
@@ -436,12 +436,12 @@ void si_emit_dpbb_state(struct si_context *sctx)
S_028C44_CONTEXT_STATES_PER_BIN(context_states_per_bin) |
S_028C44_PERSISTENT_STATES_PER_BIN(persistent_states_per_bin) |
S_028C44_DISABLE_START_OF_PRIM(disable_start_of_prim) |
S_028C44_FPOVS_PER_BATCH(fpovs_per_batch) |
S_028C44_OPTIMAL_BIN_SELECTION(1));
radeon_opt_set_context_reg(sctx, R_028060_DB_DFSM_CONTROL,
SI_TRACKED_DB_DFSM_CONTROL,
S_028060_PUNCHOUT_MODE(punchout_mode) |
S_028060_POPS_DRAIN_PS_ON_OVERLAP(1));
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
index 2a514f144b9..8798f9ad0a0 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.c
+++ b/src/gallium/drivers/radeonsi/si_state_draw.c
@@ -59,21 +59,21 @@ static unsigned si_conv_pipe_prim(unsigned mode)
return prim_conv[mode];
}
/**
* This calculates the LDS size for tessellation shaders (VS, TCS, TES).
* LS.LDS_SIZE is shared by all 3 shader stages.
*
* The information about LDS and other non-compile-time parameters is then
* written to userdata SGPRs.
*/
-static bool si_emit_derived_tess_state(struct si_context *sctx,
+static void si_emit_derived_tess_state(struct si_context *sctx,
const struct pipe_draw_info *info,
unsigned *num_patches)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
struct si_shader *ls_current;
struct si_shader_selector *ls;
/* The TES pointer will only be used for sctx->last_tcs.
* It would be wrong to think that TCS = TES. */
struct si_shader_selector *tcs =
sctx->tcs_shader.cso ? sctx->tcs_shader.cso : sctx->tes_shader.cso;
@@ -103,21 +103,21 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
ls = sctx->vs_shader.cso;
}
if (sctx->last_ls == ls_current &&
sctx->last_tcs == tcs &&
sctx->last_tes_sh_base == tes_sh_base &&
sctx->last_num_tcs_input_cp == num_tcs_input_cp &&
(!has_primid_instancing_bug ||
(sctx->last_tess_uses_primid == tess_uses_primid))) {
*num_patches = sctx->last_num_patches;
- return false;
+ return;
}
sctx->last_ls = ls_current;
sctx->last_tcs = tcs;
sctx->last_tes_sh_base = tes_sh_base;
sctx->last_num_tcs_input_cp = num_tcs_input_cp;
sctx->last_tess_uses_primid = tess_uses_primid;
/* This calculates how shader inputs and outputs among VS, TCS, and TES
* are laid out in LDS. */
@@ -298,23 +298,22 @@ static bool si_emit_derived_tess_state(struct si_context *sctx,
if (sctx->last_ls_hs_config != ls_hs_config) {
if (sctx->chip_class >= CIK) {
radeon_set_context_reg_idx(cs, R_028B58_VGT_LS_HS_CONFIG, 2,
ls_hs_config);
} else {
radeon_set_context_reg(cs, R_028B58_VGT_LS_HS_CONFIG,
ls_hs_config);
}
sctx->last_ls_hs_config = ls_hs_config;
- return true; /* true if the context rolls */
+ sctx->context_roll = true;
}
- return false;
}
static unsigned si_num_prims_for_vertices(const struct pipe_draw_info *info)
{
switch (info->mode) {
case PIPE_PRIM_PATCHES:
return info->count / info->vertices_per_patch;
case PIPE_PRIM_POLYGON:
return info->count >= 3;
case SI_PRIM_RECTANGLE_LIST:
@@ -534,44 +533,44 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
(info->instance_count > 1 &&
(info->count_from_stream_output ||
si_num_prims_for_vertices(info) <= 1))))
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
}
return ia_multi_vgt_param;
}
/* rast_prim is the primitive type after GS. */
-static bool si_emit_rasterizer_prim_state(struct si_context *sctx)
+static void si_emit_rasterizer_prim_state(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
enum pipe_prim_type rast_prim = sctx->current_rast_prim;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
/* Skip this if not rendering lines. */
if (!util_prim_is_lines(rast_prim))
- return false;
+ return;
if (rast_prim == sctx->last_rast_prim &&
rs->pa_sc_line_stipple == sctx->last_sc_line_stipple)
- return false;
+ return;
/* For lines, reset the stipple pattern at each primitive. Otherwise,
* reset the stipple pattern at each packet (line strips, line loops).
*/
radeon_set_context_reg(cs, R_028A0C_PA_SC_LINE_STIPPLE,
rs->pa_sc_line_stipple |
S_028A0C_AUTO_RESET_CNTL(rast_prim == PIPE_PRIM_LINES ? 1 : 2));
sctx->last_rast_prim = rast_prim;
sctx->last_sc_line_stipple = rs->pa_sc_line_stipple;
- return true; /* true if the context rolls */
+ sctx->context_roll = true;
}
static void si_emit_vs_state(struct si_context *sctx,
const struct pipe_draw_info *info)
{
sctx->current_vs_state &= C_VS_STATE_INDEXED;
sctx->current_vs_state |= S_VS_STATE_INDEXED(!!info->index_size);
if (sctx->num_vs_blit_sgprs) {
/* Re-emit the state after we leave u_blitter. */
@@ -652,20 +651,21 @@ static void si_emit_draw_registers(struct si_context *sctx,
radeon_set_context_reg(cs, R_028A94_VGT_MULTI_PRIM_IB_RESET_EN,
info->primitive_restart);
sctx->last_primitive_restart_en = info->primitive_restart;
}
if (si_prim_restart_index_changed(sctx, info)) {
radeon_set_context_reg(cs, R_02840C_VGT_MULTI_PRIM_IB_RESET_INDX,
info->restart_index);
sctx->last_restart_index = info->restart_index;
+ sctx->context_roll = true;
}
}
static void si_emit_draw_packets(struct si_context *sctx,
const struct pipe_draw_info *info,
struct pipe_resource *indexbuf,
unsigned index_size,
unsigned index_offset)
{
struct pipe_draw_indirect_info *indirect = info->indirect;
@@ -889,20 +889,25 @@ static void si_emit_surface_sync(struct si_context *sctx,
radeon_emit(cs, 0); /* CP_COHER_BASE_HI */
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
} else {
/* ACQUIRE_MEM is only required on a compute ring. */
radeon_emit(cs, PKT3(PKT3_SURFACE_SYNC, 3, 0));
radeon_emit(cs, cp_coher_cntl); /* CP_COHER_CNTL */
radeon_emit(cs, 0xffffffff); /* CP_COHER_SIZE */
radeon_emit(cs, 0); /* CP_COHER_BASE */
radeon_emit(cs, 0x0000000A); /* POLL_INTERVAL */
}
+
+ /* ACQUIRE_MEM has an implicit context roll if the current context
+ * is busy. */
+ if (sctx->has_graphics)
+ sctx->context_roll = true;
}
void si_emit_cache_flush(struct si_context *sctx)
{
struct radeon_cmdbuf *cs = sctx->gfx_cs;
uint32_t flags = sctx->flags;
if (!sctx->has_graphics) {
/* Only process compute flags. */
flags &= SI_CONTEXT_INV_ICACHE |
@@ -1216,40 +1221,24 @@ static void si_get_draw_start_count(struct si_context *sctx,
} else {
*start = info->start;
*count = info->count;
}
}
static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned skip_atom_mask)
{
unsigned num_patches = 0;
- /* Vega10/Raven scissor bug workaround. When any context register is
- * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
- * registers must be written too.
- */
- bool handle_scissor_bug = (sctx->family == CHIP_VEGA10 || sctx->family == CHIP_RAVEN) &&
- !si_is_atom_dirty(sctx, &sctx->atoms.s.scissors);
- bool context_roll = false; /* set correctly for GFX9 only */
- context_roll |= si_emit_rasterizer_prim_state(sctx);
+ si_emit_rasterizer_prim_state(sctx);
if (sctx->tes_shader.cso)
- context_roll |= si_emit_derived_tess_state(sctx, info, &num_patches);
-
- if (handle_scissor_bug &&
- (info->count_from_stream_output ||
- sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
- sctx->dirty_states & si_states_that_always_roll_context() ||
- si_prim_restart_index_changed(sctx, info)))
- context_roll = true;
-
- sctx->context_roll_counter = 0;
+ si_emit_derived_tess_state(sctx, info, &num_patches);
/* Emit state atoms. */
unsigned mask = sctx->dirty_atoms & ~skip_atom_mask;
while (mask)
sctx->atoms.array[u_bit_scan(&mask)].emit(sctx);
sctx->dirty_atoms &= skip_atom_mask;
/* Emit states. */
mask = sctx->dirty_states;
@@ -1258,26 +1247,20 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
struct si_pm4_state *state = sctx->queued.array[i];
if (!state || sctx->emitted.array[i] == state)
continue;
si_pm4_emit(sctx, state);
sctx->emitted.array[i] = state;
}
sctx->dirty_states = 0;
- if (handle_scissor_bug &&
- (context_roll || sctx->context_roll_counter)) {
- sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
- sctx->atoms.s.scissors.emit(sctx);
- }
-
/* Emit draw states. */
si_emit_vs_state(sctx, info);
si_emit_draw_registers(sctx, info, num_patches);
}
static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_state_rasterizer *rs = sctx->queued.named.rasterizer;
struct pipe_resource *indexbuf = info->index.resource;
@@ -1462,45 +1445,66 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
si_need_gfx_cs_space(sctx);
/* Since we've called si_context_add_resource_size for vertex buffers,
* this must be called after si_need_cs_space, because we must let
* need_cs_space flush before we add buffers to the buffer list.
*/
if (!si_upload_vertex_buffer_descriptors(sctx))
goto return_cleanup;
+ /* Vega10/Raven scissor bug workaround. When any context register is
+ * written (i.e. the GPU rolls the context), PA_SC_VPORT_SCISSOR
+ * registers must be written too.
+ */
+ bool has_gfx9_scissor_bug = sctx->screen->has_gfx9_scissor_bug;
+ unsigned masked_atoms = 0;
+
+ if (has_gfx9_scissor_bug) {
+ masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
+
+ if (info->count_from_stream_output ||
+ sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
+ sctx->dirty_states & si_states_that_always_roll_context())
+ sctx->context_roll = true;
+ }
+
/* Use optimal packet order based on whether we need to sync the pipeline. */
if (unlikely(sctx->flags & (SI_CONTEXT_FLUSH_AND_INV_CB |
SI_CONTEXT_FLUSH_AND_INV_DB |
SI_CONTEXT_PS_PARTIAL_FLUSH |
SI_CONTEXT_CS_PARTIAL_FLUSH))) {
/* If we have to wait for idle, set all states first, so that all
* SET packets are processed in parallel with previous draw calls.
* Then draw and prefetch at the end. This ensures that the time
* the CUs are idle is very short.
*/
- unsigned masked_atoms = 0;
-
if (unlikely(sctx->flags & SI_CONTEXT_FLUSH_FOR_RENDER_COND))
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
if (!si_upload_graphics_shader_descriptors(sctx))
goto return_cleanup;
/* Emit all states except possibly render condition. */
si_emit_all_states(sctx, info, masked_atoms);
si_emit_cache_flush(sctx);
/* <-- CUs are idle here. */
if (si_is_atom_dirty(sctx, &sctx->atoms.s.render_cond))
sctx->atoms.s.render_cond.emit(sctx);
+
+ if (has_gfx9_scissor_bug &&
+ (sctx->context_roll ||
+ si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
+ sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ sctx->atoms.s.scissors.emit(sctx);
+ }
sctx->dirty_atoms = 0;
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
/* <-- CUs are busy here. */
/* Start prefetches after the draw has been started. Both will run
* in parallel, but starting the draw first is more important.
*/
if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
cik_emit_prefetch_L2(sctx, false);
@@ -1511,29 +1515,41 @@ static void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *i
if (sctx->flags)
si_emit_cache_flush(sctx);
/* Only prefetch the API VS and VBO descriptors. */
if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
cik_emit_prefetch_L2(sctx, true);
if (!si_upload_graphics_shader_descriptors(sctx))
return;
- si_emit_all_states(sctx, info, 0);
+ si_emit_all_states(sctx, info, masked_atoms);
+
+ if (has_gfx9_scissor_bug &&
+ (sctx->context_roll ||
+ si_is_atom_dirty(sctx, &sctx->atoms.s.scissors))) {
+ sctx->scissors.dirty_mask = (1 << SI_MAX_VIEWPORTS) - 1;
+ sctx->atoms.s.scissors.emit(sctx);
+ }
+ sctx->dirty_atoms = 0;
+
si_emit_draw_packets(sctx, info, indexbuf, index_size, index_offset);
/* Prefetch the remaining shaders after the draw has been
* started. */
if (sctx->chip_class >= CIK && sctx->prefetch_L2_mask)
cik_emit_prefetch_L2(sctx, false);
}
+ /* Clear the context roll flag after the draw call. */
+ sctx->context_roll = false;
+
if (unlikely(sctx->current_saved_cs)) {
si_trace_emit(sctx);
si_log_draw_state(sctx, sctx->log);
}
/* Workaround for a VGT hang when streamout is enabled.
* It must be done after drawing. */
if ((sctx->family == CHIP_HAWAII ||
sctx->family == CHIP_TONGA ||
sctx->family == CHIP_FIJI) &&
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 5bdfd4f6ac1..d00bb170981 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -569,21 +569,21 @@ static void si_emit_shader_es(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
SI_TRACKED_VGT_TF_PARAM,
shader->vgt_tf_param);
if (shader->vgt_vertex_reuse_block_cntl)
radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
shader->vgt_vertex_reuse_block_cntl);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
static void si_shader_es(struct si_screen *sscreen, struct si_shader *shader)
{
struct si_pm4_state *pm4;
unsigned num_user_sgprs;
unsigned vgpr_comp_cnt;
uint64_t va;
unsigned oc_lds_en;
@@ -818,21 +818,21 @@ static void si_emit_shader_gs(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
SI_TRACKED_VGT_TF_PARAM,
shader->vgt_tf_param);
if (shader->vgt_vertex_reuse_block_cntl)
radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
shader->vgt_vertex_reuse_block_cntl);
}
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
static void si_shader_gs(struct si_screen *sscreen, struct si_shader *shader)
{
struct si_shader_selector *sel = shader->selector;
const ubyte *num_components = sel->info.num_stream_output_components;
unsigned gs_num_invocations = sel->gs_num_invocations;
struct si_pm4_state *pm4;
uint64_t va;
unsigned max_stream = sel->max_gs_stream;
@@ -995,21 +995,21 @@ static void si_emit_shader_vs(struct si_context *sctx)
radeon_opt_set_context_reg(sctx, R_028B6C_VGT_TF_PARAM,
SI_TRACKED_VGT_TF_PARAM,
shader->vgt_tf_param);
if (shader->vgt_vertex_reuse_block_cntl)
radeon_opt_set_context_reg(sctx, R_028C58_VGT_VERTEX_REUSE_BLOCK_CNTL,
SI_TRACKED_VGT_VERTEX_REUSE_BLOCK_CNTL,
shader->vgt_vertex_reuse_block_cntl);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/**
* Compute the state for \p shader, which will run as a vertex shader on the
* hardware.
*
* If \p gs is non-NULL, it points to the geometry shader for which this shader
* is the copy shader.
*/
static void si_shader_vs(struct si_screen *sscreen, struct si_shader *shader,
@@ -1187,21 +1187,21 @@ static void si_emit_shader_ps(struct si_context *sctx)
radeon_opt_set_context_reg2(sctx, R_028710_SPI_SHADER_Z_FORMAT,
SI_TRACKED_SPI_SHADER_Z_FORMAT,
shader->ctx_reg.ps.spi_shader_z_format,
shader->ctx_reg.ps.spi_shader_col_format);
radeon_opt_set_context_reg(sctx, R_02823C_CB_SHADER_MASK,
SI_TRACKED_CB_SHADER_MASK,
shader->ctx_reg.ps.cb_shader_mask);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
static void si_shader_ps(struct si_shader *shader)
{
struct tgsi_shader_info *info = &shader->selector->info;
struct si_pm4_state *pm4;
unsigned spi_ps_in_control, spi_shader_col_format, cb_shader_mask;
unsigned spi_baryc_cntl = S_0286E0_FRONT_FACE_ALL_BITS(1);
uint64_t va;
unsigned input_ena = shader->config.spi_ps_input_ena;
@@ -2863,21 +2863,21 @@ static void si_emit_spi_map(struct si_context *sctx)
/* R_028644_SPI_PS_INPUT_CNTL_0 */
/* Dota 2: Only ~16% of SPI map updates set different values. */
/* Talos: Only ~9% of SPI map updates set different values. */
unsigned initial_cdw = sctx->gfx_cs->current.cdw;
radeon_opt_set_context_regn(sctx, R_028644_SPI_PS_INPUT_CNTL_0,
spi_ps_input_cntl,
sctx->tracked_regs.spi_ps_input_cntl, num_interp);
if (initial_cdw != sctx->gfx_cs->current.cdw)
- sctx->context_roll_counter++;
+ sctx->context_roll = true;
}
/**
* Writing CONFIG or UCONFIG VGT registers requires VGT_FLUSH before that.
*/
static void si_init_config_add_vgt_flush(struct si_context *sctx)
{
if (sctx->init_config_has_vgt_flush)
return;
diff --git a/src/gallium/drivers/radeonsi/si_state_streamout.c b/src/gallium/drivers/radeonsi/si_state_streamout.c
index 2bf6862c89b..2a0a4bef9a2 100644
--- a/src/gallium/drivers/radeonsi/si_state_streamout.c
+++ b/src/gallium/drivers/radeonsi/si_state_streamout.c
@@ -296,20 +296,21 @@ void si_emit_streamout_end(struct si_context *sctx)
radeon_add_to_buffer_list(sctx, sctx->gfx_cs,
t[i]->buf_filled_size,
RADEON_USAGE_WRITE,
RADEON_PRIO_SO_FILLED_SIZE);
/* Zero the buffer size. The counters (primitives generated,
* primitives emitted) may be enabled even if there is not
* buffer bound. This ensures that the primitives-emitted query
* won't increment. */
radeon_set_context_reg(cs, R_028AD0_VGT_STRMOUT_BUFFER_SIZE_0 + 16*i, 0);
+ sctx->context_roll = true;
t[i]->buf_filled_size_valid = true;
}
sctx->streamout.begin_emitted = false;
}
/* STREAMOUT CONFIG DERIVED STATE
*
* Streamout must be enabled for the PRIMITIVES_GENERATED query to work.
diff --git a/src/gallium/drivers/radeonsi/si_state_viewport.c b/src/gallium/drivers/radeonsi/si_state_viewport.c
index f988da4520b..6f348a9b58d 100644
--- a/src/gallium/drivers/radeonsi/si_state_viewport.c
+++ b/src/gallium/drivers/radeonsi/si_state_viewport.c
@@ -276,21 +276,21 @@ static void si_emit_guardband(struct si_context *ctx)
radeon_opt_set_context_reg(ctx, R_028234_PA_SU_HARDWARE_SCREEN_OFFSET,
SI_TRACKED_PA_SU_HARDWARE_SCREEN_OFFSET,
S_028234_HW_SCREEN_OFFSET_X(hw_screen_offset_x >> 4) |
S_028234_HW_SCREEN_OFFSET_Y(hw_screen_offset_y >> 4));
radeon_opt_set_context_reg(ctx, R_028BE4_PA_SU_VTX_CNTL,
SI_TRACKED_PA_SU_VTX_CNTL,
S_028BE4_PIX_CENTER(rs->half_pixel_center) |
S_028BE4_QUANT_MODE(V_028BE4_X_16_8_FIXED_POINT_1_256TH +
vp_as_scissor.quant_mode));
if (initial_cdw != ctx->gfx_cs->current.cdw)
- ctx->context_roll_counter++;
+ ctx->context_roll = true;
}
static void si_emit_scissors(struct si_context *ctx)
{
struct radeon_cmdbuf *cs = ctx->gfx_cs;
struct pipe_scissor_state *states = ctx->scissors.states;
unsigned mask = ctx->scissors.dirty_mask;
bool scissor_enabled = ctx->queued.named.rasterizer->scissor_enable;
/* The simple case: Only 1 viewport is active. */
--
2.17.1