Mesa (main): radeonsi: implement draw_vertex_state for lower display list overhead
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Oct 1 15:45:22 UTC 2021
Module: Mesa
Branch: main
Commit: fb8f532ea1bbd9c959e0f59c652347e435a71f91
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=fb8f532ea1bbd9c959e0f59c652347e435a71f91
Author: Marek Olšák <marek.olsak at amd.com>
Date: Tue Aug 17 13:59:44 2021 -0400
radeonsi: implement draw_vertex_state for lower display list overhead
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer at amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/13050>
---
src/gallium/drivers/radeonsi/si_get.c | 3 +
src/gallium/drivers/radeonsi/si_gfx_cs.c | 40 ++-
src/gallium/drivers/radeonsi/si_pipe.c | 2 +
src/gallium/drivers/radeonsi/si_pipe.h | 37 ++-
src/gallium/drivers/radeonsi/si_state.c | 123 ++++++++-
src/gallium/drivers/radeonsi/si_state_draw.cpp | 318 +++++++++++++++++++-----
src/gallium/drivers/radeonsi/si_state_shaders.c | 28 ++-
7 files changed, 458 insertions(+), 93 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_get.c b/src/gallium/drivers/radeonsi/si_get.c
index 166102db5f5..4a4fdc91b65 100644
--- a/src/gallium/drivers/radeonsi/si_get.c
+++ b/src/gallium/drivers/radeonsi/si_get.c
@@ -164,6 +164,9 @@ static int si_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_TGSI_ATOMINC_WRAP:
return 1;
+ case PIPE_CAP_DRAW_VERTEX_STATE:
+ return !(sscreen->debug_flags & DBG(NO_FAST_DISPLAY_LIST));
+
case PIPE_CAP_GLSL_ZERO_INIT:
return 2;
diff --git a/src/gallium/drivers/radeonsi/si_gfx_cs.c b/src/gallium/drivers/radeonsi/si_gfx_cs.c
index f44f8e1eb65..9cb7cd0f813 100644
--- a/src/gallium/drivers/radeonsi/si_gfx_cs.c
+++ b/src/gallium/drivers/radeonsi/si_gfx_cs.c
@@ -298,20 +298,34 @@ void si_set_tracked_regs_to_clear_state(struct si_context *ctx)
ctx->last_gs_out_prim = 0; /* cleared by CLEAR_STATE */
}
-void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper)
+void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper,
+ pipe_draw_vertex_state_func vstate_wrapper)
{
if (wrapper) {
if (wrapper != sctx->b.draw_vbo) {
- assert (!sctx->real_draw_vbo);
+ assert(!sctx->real_draw_vbo);
+ assert(!sctx->real_draw_vertex_state);
sctx->real_draw_vbo = sctx->b.draw_vbo;
+ sctx->real_draw_vertex_state = sctx->b.draw_vertex_state;
sctx->b.draw_vbo = wrapper;
+ sctx->b.draw_vertex_state = vstate_wrapper;
}
} else if (sctx->real_draw_vbo) {
sctx->real_draw_vbo = NULL;
+ sctx->real_draw_vertex_state = NULL;
si_select_draw_vbo(sctx);
}
}
+static void si_tmz_preamble(struct si_context *sctx)
+{
+ bool secure = si_gfx_resources_check_encrypted(sctx);
+ if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
+ si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
+ RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
+ }
+}
+
static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx,
const struct pipe_draw_info *info,
unsigned drawid_offset,
@@ -320,15 +334,22 @@ static void si_draw_vbo_tmz_preamble(struct pipe_context *ctx,
unsigned num_draws) {
struct si_context *sctx = (struct si_context *)ctx;
- bool secure = si_gfx_resources_check_encrypted(sctx);
- if (secure != sctx->ws->cs_is_secure(&sctx->gfx_cs)) {
- si_flush_gfx_cs(sctx, RADEON_FLUSH_ASYNC_START_NEXT_GFX_IB_NOW |
- RADEON_FLUSH_TOGGLE_SECURE_SUBMISSION, NULL);
- }
-
+ si_tmz_preamble(sctx);
sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws);
}
+static void si_draw_vstate_tmz_preamble(struct pipe_context *ctx,
+ struct pipe_vertex_state *state,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws) {
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ si_tmz_preamble(sctx);
+ sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws);
+}
+
void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
{
bool is_secure = false;
@@ -336,7 +357,8 @@ void si_begin_new_gfx_cs(struct si_context *ctx, bool first_cs)
if (unlikely(radeon_uses_secure_bos(ctx->ws))) {
is_secure = ctx->ws->cs_is_secure(&ctx->gfx_cs);
- si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble);
+ si_install_draw_wrapper(ctx, si_draw_vbo_tmz_preamble,
+ si_draw_vstate_tmz_preamble);
}
if (ctx->is_debug)
diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
index 2b3400dc800..1883a1f0d55 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.c
+++ b/src/gallium/drivers/radeonsi/si_pipe.c
@@ -88,6 +88,7 @@ static const struct debug_named_value radeonsi_debug_options[] = {
{"check_vm", DBG(CHECK_VM), "Check VM faults and dump debug info."},
{"reserve_vmid", DBG(RESERVE_VMID), "Force VMID reservation per context."},
{"shadowregs", DBG(SHADOW_REGS), "Enable CP register shadowing."},
+ {"nofastdlist", DBG(NO_FAST_DISPLAY_LIST), "Disable fast display lists"},
/* 3D engine options: */
{"nogfx", DBG(NO_GFX), "Disable graphics. Only multimedia compute paths can be used."},
@@ -916,6 +917,7 @@ static void si_destroy_screen(struct pipe_screen *pscreen)
disk_cache_destroy(sscreen->disk_shader_cache);
util_live_shader_cache_deinit(&sscreen->live_shader_cache);
util_idalloc_mt_fini(&sscreen->buffer_ids);
+ util_vertex_state_cache_deinit(&sscreen->vertex_state_cache);
sscreen->ws->destroy(sscreen->ws);
FREE(sscreen);
diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
index 36aaa5fee27..5c115f33b73 100644
--- a/src/gallium/drivers/radeonsi/si_pipe.h
+++ b/src/gallium/drivers/radeonsi/si_pipe.h
@@ -31,6 +31,7 @@
#include "util/u_idalloc.h"
#include "util/u_suballoc.h"
#include "util/u_threaded_context.h"
+#include "util/u_vertex_state_cache.h"
#include "ac_sqtt.h"
#ifdef __cplusplus
@@ -210,6 +211,7 @@ enum
DBG_CHECK_VM,
DBG_RESERVE_VMID,
DBG_SHADOW_REGS,
+ DBG_NO_FAST_DISPLAY_LIST,
/* 3D engine options: */
DBG_NO_GFX,
@@ -659,6 +661,7 @@ struct si_screen {
unsigned ngg_subgroup_size;
struct util_idalloc_mt buffer_ids;
+ struct util_vertex_state_cache vertex_state_cache;
};
struct si_sampler_view {
@@ -867,12 +870,24 @@ struct si_small_prim_cull_info {
float small_prim_precision;
};
+struct si_vertex_state {
+ struct pipe_vertex_state b;
+ struct si_vertex_elements velems;
+ uint32_t descriptors[4 * SI_MAX_ATTRIBS];
+};
+
typedef void (*pipe_draw_vbo_func)(struct pipe_context *pipe,
const struct pipe_draw_info *info,
unsigned drawid_offset,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *draws,
unsigned num_draws);
+typedef void (*pipe_draw_vertex_state_func)(struct pipe_context *ctx,
+ struct pipe_vertex_state *vstate,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws);
struct si_context {
struct pipe_context b; /* base class */
@@ -1011,6 +1026,8 @@ struct si_context {
struct si_vertex_elements *vertex_elements;
unsigned num_vertex_elements;
unsigned cs_max_waves_per_sh;
+ bool uses_nontrivial_vs_prolog;
+ bool force_trivial_vs_prolog;
bool do_update_shaders;
bool compute_shaderbuf_sgprs_dirty;
bool compute_image_sgprs_dirty;
@@ -1219,8 +1236,10 @@ struct si_context {
struct hash_table *dirty_implicit_resources;
pipe_draw_vbo_func draw_vbo[2][2][2];
+ pipe_draw_vertex_state_func draw_vertex_state[2][2][2];
/* When b.draw_vbo is a wrapper, real_draw_vbo is the real draw_vbo function */
pipe_draw_vbo_func real_draw_vbo;
+ pipe_draw_vertex_state_func real_draw_vertex_state;
void (*emit_spi_map[33])(struct si_context *sctx);
/* SQTT */
@@ -1422,7 +1441,8 @@ void si_emit_cache_flush(struct si_context *sctx, struct radeon_cmdbuf *cs);
/* Replace the sctx->b.draw_vbo function with a wrapper. This can be use to implement
* optimizations without affecting the normal draw_vbo functions perf.
*/
-void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper);
+void si_install_draw_wrapper(struct si_context *sctx, pipe_draw_vbo_func wrapper,
+ pipe_draw_vertex_state_func vstate_wrapper);
/* si_gpu_load.c */
void si_gpu_load_kill_thread(struct si_screen *sscreen);
@@ -1954,11 +1974,22 @@ static inline void si_select_draw_vbo(struct si_context *sctx)
pipe_draw_vbo_func draw_vbo = sctx->draw_vbo[!!sctx->shader.tes.cso]
[!!sctx->shader.gs.cso]
[sctx->ngg];
+ pipe_draw_vertex_state_func draw_vertex_state =
+ sctx->draw_vertex_state[!!sctx->shader.tes.cso]
+ [!!sctx->shader.gs.cso]
+ [sctx->ngg];
assert(draw_vbo);
- if (unlikely(sctx->real_draw_vbo))
+ assert(draw_vertex_state);
+
+ if (unlikely(sctx->real_draw_vbo)) {
+ assert(sctx->real_draw_vertex_state);
sctx->real_draw_vbo = draw_vbo;
- else
+ sctx->real_draw_vertex_state = draw_vertex_state;
+ } else {
+ assert(!sctx->real_draw_vertex_state);
sctx->b.draw_vbo = draw_vbo;
+ sctx->b.draw_vertex_state = draw_vertex_state;
+ }
}
/* Return the number of samples that the rasterizer uses. */
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 2f179e9195f..8b02e79437c 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -30,6 +30,7 @@
#include "util/format/u_format.h"
#include "util/format/u_format_s3tc.h"
#include "util/u_dual_blend.h"
+#include "util/u_helpers.h"
#include "util/u_memory.h"
#include "util/u_resource.h"
#include "util/u_upload_mgr.h"
@@ -636,14 +637,8 @@ static void *si_create_blend_state(struct pipe_context *ctx, const struct pipe_b
return si_create_blend_state_mode(ctx, state, V_028808_CB_NORMAL);
}
-static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx,
- const struct pipe_draw_info *info,
- unsigned drawid_offset,
- const struct pipe_draw_indirect_info *indirect,
- const struct pipe_draw_start_count_bias *draws,
- unsigned num_draws) {
- struct si_context *sctx = (struct si_context *)ctx;
-
+static bool si_check_blend_dst_sampler_noop(struct si_context *sctx)
+{
if (sctx->framebuffer.state.nr_cbufs == 1) {
struct si_shader_selector *sel = sctx->shader.ps.cso;
bool free_nir;
@@ -677,16 +672,44 @@ static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx,
if (tex->is_depth &&
tex->depth_cleared_level_mask & BITFIELD_BIT(samp->views[unit]->u.tex.first_level) &&
tex->depth_clear_value[0] == 1) {
- return;
+ return false;
}
/* TODO: handle color textures */
}
}
}
+ return true;
+}
+
+static void si_draw_blend_dst_sampler_noop(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws) {
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ if (!si_check_blend_dst_sampler_noop(sctx))
+ return;
+
sctx->real_draw_vbo(ctx, info, drawid_offset, indirect, draws, num_draws);
}
+static void si_draw_vstate_blend_dst_sampler_noop(struct pipe_context *ctx,
+ struct pipe_vertex_state *state,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws) {
+ struct si_context *sctx = (struct si_context *)ctx;
+
+ if (!si_check_blend_dst_sampler_noop(sctx))
+ return;
+
+ sctx->real_draw_vertex_state(ctx, state, partial_velem_mask, info, draws, num_draws);
+}
+
static void si_bind_blend_state(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
@@ -731,9 +754,10 @@ static void si_bind_blend_state(struct pipe_context *ctx, void *state)
if (likely(!radeon_uses_secure_bos(sctx->ws))) {
if (unlikely(blend->allows_noop_optimization)) {
- si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop);
+ si_install_draw_wrapper(sctx, si_draw_blend_dst_sampler_noop,
+ si_draw_vstate_blend_dst_sampler_noop);
} else {
- si_install_draw_wrapper(sctx, NULL);
+ si_install_draw_wrapper(sctx, NULL, NULL);
}
}
}
@@ -5011,6 +5035,78 @@ static void si_set_vertex_buffers(struct pipe_context *ctx, unsigned start_slot,
}
}
+static struct pipe_vertex_state *
+si_create_vertex_state(struct pipe_screen *screen,
+ struct pipe_vertex_buffer *buffer,
+ const struct pipe_vertex_element *elements,
+ unsigned num_elements,
+ struct pipe_resource *indexbuf,
+ uint32_t full_velem_mask)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+ struct si_vertex_state *state = CALLOC_STRUCT(si_vertex_state);
+
+ util_init_pipe_vertex_state(screen, buffer, elements, num_elements, indexbuf, full_velem_mask,
+ &state->b);
+
+ /* Initialize the vertex element state in state->element.
+ * Do it by creating a vertex element state object and copying it there.
+ */
+ struct pipe_context ctx = {};
+ ctx.screen = screen;
+ struct si_vertex_elements *velems = si_create_vertex_elements(&ctx, num_elements, elements);
+ state->velems = *velems;
+ si_delete_vertex_element(&ctx, velems);
+
+ assert(!state->velems.instance_divisor_is_one);
+ assert(!state->velems.instance_divisor_is_fetched);
+ assert(!state->velems.fix_fetch_always);
+ assert(buffer->stride % 4 == 0);
+ assert(buffer->buffer_offset % 4 == 0);
+ assert(!buffer->is_user_buffer);
+ for (unsigned i = 0; i < num_elements; i++) {
+ assert(elements[i].src_offset % 4 == 0);
+ assert(!elements[i].dual_slot);
+ }
+
+ for (unsigned i = 0; i < num_elements; i++) {
+ si_set_vertex_buffer_descriptor(sscreen, &state->velems, &state->b.input.vbuffer, i,
+ &state->descriptors[i * 4]);
+ }
+
+ return &state->b;
+}
+
+static void si_vertex_state_destroy(struct pipe_screen *screen,
+ struct pipe_vertex_state *state)
+{
+ pipe_vertex_buffer_unreference(&state->input.vbuffer);
+ pipe_resource_reference(&state->input.indexbuf, NULL);
+ FREE(state);
+}
+
+static struct pipe_vertex_state *
+si_pipe_create_vertex_state(struct pipe_screen *screen,
+ struct pipe_vertex_buffer *buffer,
+ const struct pipe_vertex_element *elements,
+ unsigned num_elements,
+ struct pipe_resource *indexbuf,
+ uint32_t full_velem_mask)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ return util_vertex_state_cache_get(screen, buffer, elements, num_elements, indexbuf,
+ full_velem_mask, &sscreen->vertex_state_cache);
+}
+
+static void si_pipe_vertex_state_destroy(struct pipe_screen *screen,
+ struct pipe_vertex_state *state)
+{
+ struct si_screen *sscreen = (struct si_screen *)screen;
+
+ util_vertex_state_destroy(screen, &sscreen->vertex_state_cache, state);
+}
+
/*
* Misc
*/
@@ -5177,12 +5273,17 @@ void si_init_state_functions(struct si_context *sctx)
void si_init_screen_state_functions(struct si_screen *sscreen)
{
sscreen->b.is_format_supported = si_is_format_supported;
+ sscreen->b.create_vertex_state = si_pipe_create_vertex_state;
+ sscreen->b.vertex_state_destroy = si_pipe_vertex_state_destroy;
if (sscreen->info.chip_class >= GFX10) {
sscreen->make_texture_descriptor = gfx10_make_texture_descriptor;
} else {
sscreen->make_texture_descriptor = si_make_texture_descriptor;
}
+
+ util_vertex_state_cache_init(&sscreen->vertex_state_cache,
+ si_create_vertex_state, si_vertex_state_destroy);
}
static void si_set_grbm_gfx_index(struct si_context *sctx, struct si_pm4_state *pm4, unsigned value)
diff --git a/src/gallium/drivers/radeonsi/si_state_draw.cpp b/src/gallium/drivers/radeonsi/si_state_draw.cpp
index 6382c34a598..3999fb4eac2 100644
--- a/src/gallium/drivers/radeonsi/si_state_draw.cpp
+++ b/src/gallium/drivers/radeonsi/si_state_draw.cpp
@@ -25,6 +25,7 @@
#include "ac_exp_param.h"
#include "ac_sqtt.h"
#include "si_build_pm4.h"
+#include "util/u_cpu_detect.h"
#include "util/u_index_modify.h"
#include "util/u_prim.h"
#include "util/u_upload_mgr.h"
@@ -944,6 +945,12 @@ static bool si_is_line_stipple_enabled(struct si_context *sctx)
(rs->polygon_mode_is_lines || util_prim_is_lines(sctx->current_rast_prim));
}
+enum si_is_draw_vertex_state {
+ DRAW_VERTEX_STATE_OFF,
+ DRAW_VERTEX_STATE_ON,
+};
+
+template <si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim,
unsigned min_vertex_count,
@@ -951,6 +958,9 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *
unsigned num_prims,
ubyte vertices_per_patch)
{
+ if (IS_DRAW_VERTEX_STATE)
+ return 0;
+
if (indirect) {
return indirect->buffer ||
(instance_count > 1 && indirect->count_from_stream_output);
@@ -960,7 +970,8 @@ static bool num_instanced_prims_less_than(const struct pipe_draw_indirect_info *
}
}
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned num_patches,
@@ -980,12 +991,15 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
}
key.u.prim = prim;
- key.u.uses_instancing = (indirect && indirect->buffer) || instance_count > 1;
+ key.u.uses_instancing = !IS_DRAW_VERTEX_STATE &&
+ ((indirect && indirect->buffer) || instance_count > 1);
key.u.multi_instances_smaller_than_primgroup =
- num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count,
- primgroup_size, sctx->patch_vertices);
- key.u.primitive_restart = primitive_restart;
- key.u.count_from_stream_output = indirect && indirect->count_from_stream_output;
+ num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
+ instance_count, primgroup_size,
+ sctx->patch_vertices);
+ key.u.primitive_restart = !IS_DRAW_VERTEX_STATE && primitive_restart;
+ key.u.count_from_stream_output = !IS_DRAW_VERTEX_STATE && indirect &&
+ indirect->count_from_stream_output;
key.u.line_stipple_enabled = si_is_line_stipple_enabled(sctx);
ia_multi_vgt_param =
@@ -1003,8 +1017,8 @@ static unsigned si_get_ia_multi_vgt_param(struct si_context *sctx,
*/
if (GFX_VERSION == GFX7 &&
sctx->family == CHIP_HAWAII && G_028AA8_SWITCH_ON_EOI(ia_multi_vgt_param) &&
- num_instanced_prims_less_than(indirect, prim, min_vertex_count, instance_count, 2,
- sctx->patch_vertices))
+ num_instanced_prims_less_than<IS_DRAW_VERTEX_STATE>(indirect, prim, min_vertex_count,
+ instance_count, 2, sctx->patch_vertices))
sctx->flags |= SI_CONTEXT_VGT_FLUSH;
}
@@ -1089,11 +1103,11 @@ static void si_emit_rasterizer_prim_state(struct si_context *sctx)
}
}
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
-ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
static void si_emit_vs_state(struct si_context *sctx, unsigned index_size)
{
- if (sctx->num_vs_blit_sgprs) {
+ if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) {
/* Re-emit the state after we leave u_blitter. */
sctx->last_vs_state = ~0;
return;
@@ -1143,7 +1157,8 @@ static bool si_prim_restart_index_changed(struct si_context *sctx, bool primitiv
sctx->last_restart_index == SI_RESTART_INDEX_UNKNOWN);
}
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS> ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned num_patches,
@@ -1154,7 +1169,7 @@ static void si_emit_ia_multi_vgt_param(struct si_context *sctx,
unsigned ia_multi_vgt_param;
ia_multi_vgt_param =
- si_get_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>
+ si_get_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS, IS_DRAW_VERTEX_STATE>
(sctx, indirect, prim, num_patches, instance_count, primitive_restart,
min_vertex_count);
@@ -1225,7 +1240,8 @@ static void gfx10_emit_ge_cntl(struct si_context *sctx, unsigned num_patches)
}
}
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
static void si_emit_draw_registers(struct si_context *sctx,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned num_patches,
@@ -1234,10 +1250,13 @@ static void si_emit_draw_registers(struct si_context *sctx,
{
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
+ if (IS_DRAW_VERTEX_STATE)
+ primitive_restart = false;
+
if (GFX_VERSION >= GFX10)
gfx10_emit_ge_cntl<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, num_patches);
else
- si_emit_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS>
+ si_emit_ia_multi_vgt_param<GFX_VERSION, HAS_TESS, HAS_GS, IS_DRAW_VERTEX_STATE>
(sctx, indirect, prim, num_patches, instance_count, primitive_restart,
min_vertex_count);
@@ -1284,7 +1303,7 @@ static void si_emit_draw_registers(struct si_context *sctx,
} \
} while (0)
-template <chip_class GFX_VERSION, si_has_ngg NGG>
+template <chip_class GFX_VERSION, si_has_ngg NGG, si_is_draw_vertex_state IS_DRAW_VERTEX_STATE>
ALWAYS_INLINE
static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw_info *info,
unsigned drawid_base,
@@ -1304,7 +1323,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
uint32_t use_opaque = 0;
- if (indirect && indirect->count_from_stream_output) {
+ if (!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) {
struct si_streamout_target *t = (struct si_streamout_target *)indirect->count_from_stream_output;
radeon_begin(cs);
@@ -1379,7 +1398,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
unsigned sh_base_reg = sctx->shader_pointers.sh_base[PIPE_SHADER_VERTEX];
bool render_cond_bit = sctx->render_cond_enabled;
- if (indirect) {
+ if (!IS_DRAW_VERTEX_STATE && indirect) {
assert(num_draws == 1);
uint64_t indirect_va = si_resource(indirect->buffer)->gpu_address;
@@ -1454,10 +1473,10 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
/* Base vertex and start instance. */
int base_vertex = original_index_size ? draws[0].index_bias : draws[0].start;
- bool set_draw_id = sctx->vs_uses_draw_id;
+ bool set_draw_id = !IS_DRAW_VERTEX_STATE && sctx->vs_uses_draw_id;
bool set_base_instance = sctx->vs_uses_base_instance;
- if (sctx->num_vs_blit_sgprs) {
+ if (!IS_DRAW_VERTEX_STATE && sctx->num_vs_blit_sgprs) {
/* Re-emit draw constants after we leave u_blitter. */
si_invalidate_draw_sh_constants(sctx);
@@ -1496,7 +1515,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
}
/* Don't update draw_id in the following code if it doesn't increment. */
- bool increment_draw_id = num_draws > 1 && set_draw_id && info->increment_draw_id;
+ bool increment_draw_id = !IS_DRAW_VERTEX_STATE && num_draws > 1 &&
+ set_draw_id && info->increment_draw_id;
if (index_size) {
/* NOT_EOP allows merging multiple draws into 1 wave, but only user VGPRs
@@ -1514,7 +1534,8 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
* else for (all draws);
*
*/
- bool index_bias_varies = num_draws > 1 && info->index_bias_varies;
+ bool index_bias_varies = !IS_DRAW_VERTEX_STATE && num_draws > 1 &&
+ info->index_bias_varies;
if (increment_draw_id) {
if (index_bias_varies) {
@@ -1655,7 +1676,7 @@ static void si_emit_draw_packets(struct si_context *sctx, const struct pipe_draw
radeon_emit(draws[i].count);
radeon_emit(V_0287F0_DI_SRC_SEL_AUTO_INDEX | use_opaque);
}
- if (num_draws > 1 && !sctx->num_vs_blit_sgprs)
+ if (num_draws > 1 && (IS_DRAW_VERTEX_STATE || !sctx->num_vs_blit_sgprs))
sctx->last_base_vertex = draws[num_draws - 1].start;
}
}
@@ -1743,20 +1764,56 @@ void si_set_vertex_buffer_descriptor(struct si_screen *sscreen, struct si_vertex
#endif
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG> ALWAYS_INLINE
-static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx)
+/* util_bitcount has large measurable overhead (~2% difference in viewperf), so we use
+ * the POPCNT x86 instruction via inline assembly if the CPU supports it.
+ */
+enum si_has_popcnt {
+ POPCNT_NO,
+ POPCNT_YES,
+};
+
+template<si_has_popcnt POPCNT>
+unsigned bitcount_asm(unsigned n)
+{
+ if (POPCNT == POPCNT_YES)
+ return util_popcnt_inline_asm(n);
+ else
+ return util_bitcount(n);
+}
+
+template<si_has_popcnt POPCNT>
+static ALWAYS_INLINE unsigned get_next_vertex_state_elem(struct pipe_vertex_state *state,
+ uint32_t *partial_velem_mask)
{
- unsigned count = sctx->num_vertex_elements;
+ unsigned semantic_index = u_bit_scan(partial_velem_mask);
+ assert(state->input.full_velem_mask & BITFIELD_BIT(semantic_index));
+ /* A prefix mask of the full mask gives us the index in pipe_vertex_state. */
+ return bitcount_asm<POPCNT>(state->input.full_velem_mask & BITFIELD_MASK(semantic_index));
+}
+
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx,
+ struct pipe_vertex_state *state,
+ uint32_t partial_velem_mask)
+{
+ struct si_vertex_state *vstate = (struct si_vertex_state *)state;
+ unsigned count = IS_DRAW_VERTEX_STATE ? bitcount_asm<POPCNT>(partial_velem_mask) :
+ sctx->num_vertex_elements;
+ unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
+ PIPE_SHADER_VERTEX);
unsigned num_vbos_in_user_sgprs = si_num_vbos_in_user_sgprs_inline(GFX_VERSION);
bool pointer_dirty, user_sgprs_dirty;
assert(count <= SI_MAX_ATTRIBS);
- if (sctx->vertex_buffers_dirty) {
+ if (sctx->vertex_buffers_dirty || IS_DRAW_VERTEX_STATE) {
assert(count);
struct si_vertex_elements *velems = sctx->vertex_elements;
- unsigned alloc_size = velems->vb_desc_list_alloc_size;
+ unsigned alloc_size = IS_DRAW_VERTEX_STATE ?
+ vstate->velems.vb_desc_list_alloc_size :
+ velems->vb_desc_list_alloc_size;
uint32_t *ptr;
if (alloc_size) {
@@ -1783,27 +1840,64 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx)
si_resource_reference(&sctx->vb_descriptors_buffer, NULL);
}
- unsigned first_vb_use_mask = velems->first_vb_use_mask;
+ if (IS_DRAW_VERTEX_STATE) {
+ unsigned partial_count = bitcount_asm<POPCNT>(partial_velem_mask);
+ unsigned i = 0;
- for (unsigned i = 0; i < count; i++) {
- unsigned vbo_index = velems->vertex_buffer_index[i];
- struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
- uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
- : &ptr[(i - num_vbos_in_user_sgprs) * 4];
+ if (num_vbos_in_user_sgprs) {
+ unsigned num_vb_sgprs = MIN2(partial_count, num_vbos_in_user_sgprs) * 4;
- if (!si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc))
- continue;
+ radeon_begin(&sctx->gfx_cs);
+ radeon_set_sh_reg_seq(sh_base + SI_SGPR_VS_VB_DESCRIPTOR_FIRST * 4, num_vb_sgprs);
- if (first_vb_use_mask & (1 << i)) {
- radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource),
+ for (; partial_velem_mask && i < num_vbos_in_user_sgprs; i++) {
+ unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
+
+ radeon_emit_array(&vstate->descriptors[velem_index * 4], 4);
+ }
+ radeon_end();
+ }
+
+ for (; partial_velem_mask; i++) {
+ unsigned velem_index = get_next_vertex_state_elem<POPCNT>(state, &partial_velem_mask);
+ uint32_t *desc = &ptr[(i - num_vbos_in_user_sgprs) * 4];
+
+ memcpy(desc, &vstate->descriptors[velem_index * 4], 16);
+ }
+
+ if (vstate->b.input.vbuffer.buffer.resource != vstate->b.input.indexbuf) {
+ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs,
+ si_resource(vstate->b.input.vbuffer.buffer.resource),
RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
}
- }
- sctx->vertex_buffers_dirty = false;
+ /* The next draw_vbo should recompute and rebind vertex buffer descriptors. */
+ sctx->vertex_buffers_dirty = sctx->num_vertex_elements > 0;
+
+ user_sgprs_dirty = false; /* We just set them above. */
+ pointer_dirty = count > num_vbos_in_user_sgprs;
+ } else {
+ unsigned first_vb_use_mask = velems->first_vb_use_mask;
+
+ for (unsigned i = 0; i < count; i++) {
+ unsigned vbo_index = velems->vertex_buffer_index[i];
+ struct pipe_vertex_buffer *vb = &sctx->vertex_buffer[vbo_index];
+ uint32_t *desc = i < num_vbos_in_user_sgprs ? &sctx->vb_descriptor_user_sgprs[i * 4]
+ : &ptr[(i - num_vbos_in_user_sgprs) * 4];
- pointer_dirty = alloc_size != 0;
- user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
+ if (!si_set_vb_descriptor<GFX_VERSION>(velems, vb, i, desc))
+ continue;
+
+ if (first_vb_use_mask & (1 << i)) {
+ radeon_add_to_buffer_list(sctx, &sctx->gfx_cs, si_resource(vb->buffer.resource),
+ RADEON_USAGE_READ, RADEON_PRIO_VERTEX_BUFFER);
+ }
+ }
+
+ sctx->vertex_buffers_dirty = false;
+ user_sgprs_dirty = num_vbos_in_user_sgprs > 0;
+ pointer_dirty = alloc_size != 0;
+ }
} else {
pointer_dirty = sctx->vertex_buffer_pointer_dirty;
user_sgprs_dirty = sctx->vertex_buffer_user_sgprs_dirty;
@@ -1811,8 +1905,6 @@ static bool si_upload_and_prefetch_VB_descriptors(struct si_context *sctx)
if (pointer_dirty || user_sgprs_dirty) {
struct radeon_cmdbuf *cs = &sctx->gfx_cs;
- unsigned sh_base = si_get_user_data_base(GFX_VERSION, HAS_TESS, HAS_GS, NGG,
- PIPE_SHADER_VERTEX);
assert(count);
radeon_begin(cs);
@@ -1922,8 +2014,8 @@ static void si_get_draw_start_count(struct si_context *sctx, const struct pipe_d
}
}
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
-ALWAYS_INLINE
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE> ALWAYS_INLINE
static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
enum pipe_prim_type prim, unsigned instance_count,
@@ -1964,8 +2056,8 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
}
/* Emit draw states. */
- si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx, info->index_size);
- si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+ si_emit_vs_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>(sctx, info->index_size);
+ si_emit_draw_registers<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
(sctx, indirect, prim, num_patches, instance_count, primitive_restart,
info->restart_index, min_vertex_count);
}
@@ -1975,13 +2067,16 @@ static void si_emit_all_states(struct si_context *sctx, const struct pipe_draw_i
pipe_resource_reference(&indexbuf, NULL); \
} while (0)
-template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
-static void si_draw_vbo(struct pipe_context *ctx,
- const struct pipe_draw_info *info,
- unsigned drawid_offset,
- const struct pipe_draw_indirect_info *indirect,
- const struct pipe_draw_start_count_bias *draws,
- unsigned num_draws)
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+ si_is_draw_vertex_state IS_DRAW_VERTEX_STATE, si_has_popcnt POPCNT> ALWAYS_INLINE
+static void si_draw(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws,
+ struct pipe_vertex_state *state,
+ uint32_t partial_velem_mask)
{
/* Keep code that uses the least number of local variables as close to the beginning
* of this function as possible to minimize register pressure.
@@ -2052,11 +2147,14 @@ static void si_draw_vbo(struct pipe_context *ctx,
* 'instance_count == 0' seems to be problematic on Renoir chips (#4866),
* so simplify the condition and drop these draws for all <= GFX9 chips.
*/
- if (GFX_VERSION <= GFX9 && unlikely(!indirect && !instance_count))
+ if (GFX_VERSION <= GFX9 && unlikely(!IS_DRAW_VERTEX_STATE && !indirect && !instance_count))
return;
struct si_shader_selector *vs = sctx->shader.vs.cso;
- if (unlikely(!vs || sctx->num_vertex_elements < vs->num_vs_inputs ||
+ struct si_vertex_state *vstate = (struct si_vertex_state *)state;
+ if (unlikely(!vs ||
+ (!IS_DRAW_VERTEX_STATE && sctx->num_vertex_elements < vs->num_vs_inputs) ||
+ (IS_DRAW_VERTEX_STATE && vstate->velems.count < vs->num_vs_inputs) ||
!sctx->shader.ps.cso || (HAS_TESS != (prim == PIPE_PRIM_PATCHES)))) {
assert(0);
return;
@@ -2084,7 +2182,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (index_size) {
/* Translate or upload, if needed. */
/* 8-bit indices are supported on GFX8. */
- if (GFX_VERSION <= GFX7 && index_size == 1) {
+ if (!IS_DRAW_VERTEX_STATE && GFX_VERSION <= GFX7 && index_size == 1) {
unsigned start, count, start_offset, size, offset;
void *ptr;
@@ -2103,7 +2201,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
/* info->start will be added by the drawing code */
index_offset = offset - start_offset;
index_size = 2;
- } else if (info->has_user_indices) {
+ } else if (!IS_DRAW_VERTEX_STATE && info->has_user_indices) {
unsigned start_offset;
assert(!indirect);
@@ -2130,7 +2228,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
unsigned min_direct_count = 0;
unsigned total_direct_count = 0;
- if (indirect) {
+ if (!IS_DRAW_VERTEX_STATE && indirect) {
/* Add the buffer size for memory checking in need_cs_space. */
if (indirect->buffer)
si_context_add_resource_size(sctx, indirect->buffer);
@@ -2192,6 +2290,32 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
}
+ if (IS_DRAW_VERTEX_STATE) {
+ /* draw_vertex_state doesn't use the current vertex buffers and vertex elements,
+ * so disable any non-trivial VS prolog that is based on them, such as vertex
+ * format lowering.
+ */
+ if (!sctx->force_trivial_vs_prolog) {
+ sctx->force_trivial_vs_prolog = true;
+
+ /* Update shaders to disable the non-trivial VS prolog. */
+ if (sctx->uses_nontrivial_vs_prolog) {
+ si_vs_key_update_inputs(sctx);
+ sctx->do_update_shaders = true;
+ }
+ }
+ } else {
+ if (sctx->force_trivial_vs_prolog) {
+ sctx->force_trivial_vs_prolog = false;
+
+ /* Update shaders to enable the non-trivial VS prolog. */
+ if (sctx->uses_nontrivial_vs_prolog) {
+ si_vs_key_update_inputs(sctx);
+ sctx->do_update_shaders = true;
+ }
+ }
+ }
+
/* Update NGG culling settings. */
uint8_t old_ngg_culling = sctx->ngg_culling;
if (GFX_VERSION >= GFX10) {
@@ -2314,7 +2438,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.scissors);
gfx9_scissor_bug = true;
- if ((indirect && indirect->count_from_stream_output) ||
+ if ((!IS_DRAW_VERTEX_STATE && indirect && indirect->count_from_stream_output) ||
sctx->dirty_atoms & si_atoms_that_always_roll_context() ||
sctx->dirty_states & si_states_that_always_roll_context())
sctx->context_roll = true;
@@ -2333,7 +2457,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
masked_atoms |= si_get_atom_bit(sctx, &sctx->atoms.s.render_cond);
/* Emit all states except possibly render condition. */
- si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+ si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
(sctx, info, indirect, prim, instance_count, min_direct_count,
primitive_restart, masked_atoms);
sctx->emit_cache_flush(sctx, &sctx->gfx_cs);
@@ -2342,7 +2466,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
/* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
 * It should be done after cache flushing.
*/
- if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
+ if (unlikely((!si_upload_and_prefetch_VB_descriptors
+ <GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE, POPCNT>
+ (sctx, state, partial_velem_mask)))) {
DRAW_CLEANUP;
return;
}
@@ -2359,7 +2485,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
assert(sctx->dirty_atoms == 0);
- si_emit_draw_packets<GFX_VERSION, NGG>
+ si_emit_draw_packets<GFX_VERSION, NGG, IS_DRAW_VERTEX_STATE>
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
index_size, index_offset, instance_count, original_index_size);
/* <-- CUs are busy here. */
@@ -2381,12 +2507,14 @@ static void si_draw_vbo(struct pipe_context *ctx,
/* This uploads VBO descriptors, sets user SGPRs, and executes the L2 prefetch.
 * It should be done after cache flushing and after the VS prefetch.
*/
- if (unlikely((!si_upload_and_prefetch_VB_descriptors<GFX_VERSION, HAS_TESS, HAS_GS, NGG>(sctx)))) {
+ if (unlikely((!si_upload_and_prefetch_VB_descriptors
+ <GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE, POPCNT>
+ (sctx, state, partial_velem_mask)))) {
DRAW_CLEANUP;
return;
}
- si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG>
+ si_emit_all_states<GFX_VERSION, HAS_TESS, HAS_GS, NGG, IS_DRAW_VERTEX_STATE>
(sctx, info, indirect, prim, instance_count, min_direct_count,
primitive_restart, masked_atoms);
@@ -2397,7 +2525,7 @@ static void si_draw_vbo(struct pipe_context *ctx,
}
assert(sctx->dirty_atoms == 0);
- si_emit_draw_packets<GFX_VERSION, NGG>
+ si_emit_draw_packets<GFX_VERSION, NGG, IS_DRAW_VERTEX_STATE>
(sctx, info, drawid_offset, indirect, draws, num_draws, total_direct_count, indexbuf,
index_size, index_offset, instance_count, original_index_size);
@@ -2428,9 +2556,9 @@ static void si_draw_vbo(struct pipe_context *ctx,
if (unlikely(sctx->decompression_enabled)) {
sctx->num_decompress_calls++;
} else {
- sctx->num_draw_calls++;
+ sctx->num_draw_calls += num_draws;
if (primitive_restart)
- sctx->num_prim_restart_calls++;
+ sctx->num_prim_restart_calls += num_draws;
}
if (!sctx->blitter_running && sctx->framebuffer.state.zsbuf) {
@@ -2441,6 +2569,39 @@ static void si_draw_vbo(struct pipe_context *ctx,
DRAW_CLEANUP;
}
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG>
+static void si_draw_vbo(struct pipe_context *ctx,
+ const struct pipe_draw_info *info,
+ unsigned drawid_offset,
+ const struct pipe_draw_indirect_info *indirect,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws)
+{
+ si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_OFF, POPCNT_NO>
+ (ctx, info, drawid_offset, indirect, draws, num_draws, NULL, 0);
+}
+
+template <chip_class GFX_VERSION, si_has_tess HAS_TESS, si_has_gs HAS_GS, si_has_ngg NGG,
+ si_has_popcnt POPCNT>
+static void si_draw_vertex_state(struct pipe_context *ctx,
+ struct pipe_vertex_state *vstate,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws)
+{
+ struct si_vertex_state *state = (struct si_vertex_state *)vstate;
+ struct pipe_draw_info dinfo = {};
+
+ dinfo.mode = info.mode;
+ dinfo.index_size = 4;
+ dinfo.instance_count = 1;
+ dinfo.index.resource = state->b.input.indexbuf;
+
+ si_draw<GFX_VERSION, HAS_TESS, HAS_GS, NGG, DRAW_VERTEX_STATE_ON, POPCNT>
+ (ctx, &dinfo, 0, NULL, draws, num_draws, vstate, partial_velem_mask);
+}
+
static void si_draw_rectangle(struct blitter_context *blitter, void *vertex_elements_cso,
blitter_get_vs_func get_vs, int x1, int y1, int x2, int y2,
float depth, unsigned num_instances, enum blitter_attrib_type type,
@@ -2492,6 +2653,14 @@ static void si_init_draw_vbo(struct si_context *sctx)
sctx->draw_vbo[HAS_TESS][HAS_GS][NGG] =
si_draw_vbo<GFX_VERSION, HAS_TESS, HAS_GS, NGG>;
+
+ if (util_get_cpu_caps()->has_popcnt) {
+ sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
+ si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_YES>;
+ } else {
+ sctx->draw_vertex_state[HAS_TESS][HAS_GS][NGG] =
+ si_draw_vertex_state<GFX_VERSION, HAS_TESS, HAS_GS, NGG, POPCNT_NO>;
+ }
}
template <chip_class GFX_VERSION>
@@ -2517,6 +2686,16 @@ static void si_invalid_draw_vbo(struct pipe_context *pipe,
unreachable("vertex shader not bound");
}
+static void si_invalid_draw_vertex_state(struct pipe_context *ctx,
+ struct pipe_vertex_state *vstate,
+ uint32_t partial_velem_mask,
+ struct pipe_draw_vertex_state_info info,
+ const struct pipe_draw_start_count_bias *draws,
+ unsigned num_draws)
+{
+ unreachable("vertex shader not bound");
+}
+
extern "C"
void GFX(si_init_draw_functions_)(struct si_context *sctx)
{
@@ -2528,6 +2707,7 @@ void GFX(si_init_draw_functions_)(struct si_context *sctx)
* initialization of callbacks in upper layers (such as u_threaded_context).
*/
sctx->b.draw_vbo = si_invalid_draw_vbo;
+ sctx->b.draw_vertex_state = si_invalid_draw_vertex_state;
sctx->blitter->draw_rectangle = si_draw_rectangle;
si_init_ia_multi_vgt_param_table(sctx);
diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
index 2414d52a7fd..b0cf1d1b4eb 100644
--- a/src/gallium/drivers/radeonsi/si_state_shaders.c
+++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
@@ -1816,9 +1816,15 @@ void si_vs_key_update_inputs(struct si_context *sctx)
if (vs->info.base.vs.blit_sgprs_amd) {
si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog);
key->opt.prefer_mono = 0;
+ sctx->uses_nontrivial_vs_prolog = false;
return;
}
+ bool uses_nontrivial_vs_prolog = false;
+
+ if (elts->instance_divisor_is_one || elts->instance_divisor_is_fetched)
+ uses_nontrivial_vs_prolog = true;
+
key->part.vs.prolog.instance_divisor_is_one = elts->instance_divisor_is_one;
key->part.vs.prolog.instance_divisor_is_fetched = elts->instance_divisor_is_fetched;
key->opt.prefer_mono = elts->instance_divisor_is_fetched;
@@ -1846,9 +1852,29 @@ void si_vs_key_update_inputs(struct si_context *sctx)
while (fix) {
unsigned i = u_bit_scan(&fix);
- key->mono.vs_fix_fetch[i].bits = elts->fix_fetch[i];
+ uint8_t fix_fetch = elts->fix_fetch[i];
+
+ key->mono.vs_fix_fetch[i].bits = fix_fetch;
+ if (fix_fetch)
+ uses_nontrivial_vs_prolog = true;
}
key->mono.vs_fetch_opencode = opencode;
+ if (opencode)
+ uses_nontrivial_vs_prolog = true;
+
+ sctx->uses_nontrivial_vs_prolog = uses_nontrivial_vs_prolog;
+
+ /* draw_vertex_state (display lists) requires a trivial VS prolog that ignores
+ * the current vertex buffers and vertex elements.
+ *
+ * We just computed the prolog key because we needed to set uses_nontrivial_vs_prolog,
+ * so that we know whether the VS prolog should be updated when we switch from
+ * draw_vertex_state to draw_vbo. Now clear the VS prolog for draw_vertex_state.
+ * This should happen rarely because the VS prolog should be trivial in most
+ * cases.
+ */
+ if (uses_nontrivial_vs_prolog && sctx->force_trivial_vs_prolog)
+ si_clear_vs_key_inputs(sctx, key, &key->part.vs.prolog);
}
void si_get_vs_key_inputs(struct si_context *sctx, struct si_shader_key *key,
More information about the mesa-commit
mailing list