[Mesa-dev] [PATCH 31/31] radeonsi: generate GS prolog to (partially) fix triangle strip adjacency rotation
Marek Olšák
maraeo at gmail.com
Wed Nov 2 21:11:29 UTC 2016
On Mon, Oct 31, 2016 at 11:11 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Fixes GL45-CTS.geometry_shader.adjacency.adjacency_indiced_triangle_strip and
> others.
>
> This leaves the case of triangle strips with adjacency and primitive restarts
> open. It seems that the only thing that cares about that is a piglit test.
> Fixing this efficiently would be really involved, and I don't want to use the
> hammer of degrading to software handling of indices because there may well
> be software that uses this draw mode (without caring about the precise
> rotation of triangles).
> ---
> src/gallium/drivers/radeonsi/si_pipe.c | 1 +
> src/gallium/drivers/radeonsi/si_pipe.h | 2 +
> src/gallium/drivers/radeonsi/si_shader.c | 112 ++++++++++++++++++++++++
> src/gallium/drivers/radeonsi/si_shader.h | 10 +++
> src/gallium/drivers/radeonsi/si_state_draw.c | 18 ++++
> src/gallium/drivers/radeonsi/si_state_shaders.c | 7 +-
> 6 files changed, 146 insertions(+), 4 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index bf3b442..bc633bb 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -665,20 +665,21 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
> return 0;
> }
>
> static void si_destroy_screen(struct pipe_screen* pscreen)
> {
> struct si_screen *sscreen = (struct si_screen *)pscreen;
> struct si_shader_part *parts[] = {
> sscreen->vs_prologs,
> sscreen->vs_epilogs,
> sscreen->tcs_epilogs,
> + sscreen->gs_prologs,
> sscreen->ps_prologs,
> sscreen->ps_epilogs
> };
> unsigned i;
>
> if (!sscreen)
> return;
>
> if (!sscreen->b.ws->unref(sscreen->b.ws))
> return;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index e7617bc..8e6a94d 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -89,20 +89,21 @@ struct si_screen {
> bool has_ds_bpermute;
>
> /* Whether shaders are monolithic (1-part) or separate (3-part). */
> bool use_monolithic_shaders;
> bool record_llvm_ir;
>
> pipe_mutex shader_parts_mutex;
> struct si_shader_part *vs_prologs;
> struct si_shader_part *vs_epilogs;
> struct si_shader_part *tcs_epilogs;
> + struct si_shader_part *gs_prologs;
> struct si_shader_part *ps_prologs;
> struct si_shader_part *ps_epilogs;
>
> /* Shader cache in memory.
> *
> * Design & limitations:
> * - The shader cache is per screen (= per process), never saved to
> * disk, and skips redundant shader compilations from TGSI to bytecode.
> * - It can only be used with one-variant-per-shader support, in which
> * case only the main (typically middle) part of shaders is cached.
> @@ -312,20 +313,21 @@ struct si_context {
> int last_sh_base_reg;
> int last_primitive_restart_en;
> int last_restart_index;
> int last_gs_out_prim;
> int last_prim;
> int last_multi_vgt_param;
> int last_rast_prim;
> unsigned last_sc_line_stipple;
> int last_vtx_reuse_depth;
> int current_rast_prim; /* primitive type after TES, GS */
> + bool gs_tri_strip_adj_fix;
> unsigned last_gsvs_itemsize;
>
> /* Scratch buffer */
> struct r600_resource *scratch_buffer;
> bool emit_scratch_reloc;
> unsigned scratch_waves;
> unsigned spi_tmpring_size;
>
> struct r600_resource *compute_scratch_buffer;
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index fe15420..9141d62 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -6740,20 +6740,92 @@ static void si_get_ps_epilog_key(struct si_shader *shader,
> struct tgsi_shader_info *info = &shader->selector->info;
> memset(key, 0, sizeof(*key));
> key->ps_epilog.colors_written = info->colors_written;
> key->ps_epilog.writes_z = info->writes_z;
> key->ps_epilog.writes_stencil = info->writes_stencil;
> key->ps_epilog.writes_samplemask = info->writes_samplemask;
> key->ps_epilog.states = shader->key.ps.epilog;
> }
>
> /**
> + * Build the GS prolog function. Rotate the input vertices for triangle strips
> + * with adjacency.
> + */
> +static void si_build_gs_prolog_function(struct si_shader_context *ctx,
> + union si_shader_part_key *key)
> +{
> + const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
> + const unsigned num_vgprs = 8;
> + struct gallivm_state *gallivm = &ctx->gallivm;
> + LLVMBuilderRef builder = gallivm->builder;
> + LLVMTypeRef params[32];
> + LLVMTypeRef returns[32];
> + LLVMValueRef func, ret;
> +
> + for (unsigned i = 0; i < num_sgprs; ++i) {
> + params[i] = ctx->i32;
> + returns[i] = ctx->i32;
> + }
> +
> + for (unsigned i = 0; i < num_vgprs; ++i) {
> + params[num_sgprs + i] = ctx->i32;
> + returns[num_sgprs + i] = ctx->f32;
> + }
> +
> + /* Create the function. */
> + si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
> + params, num_sgprs + num_vgprs, num_sgprs - 1);
> + func = ctx->main_fn;
> +
> + /* Copy inputs to outputs. This should be no-op, as the registers match,
> + * but it will prevent the compiler from overwriting them unintentionally.
> + */
> + ret = ctx->return_value;
> + for (unsigned i = 0; i < num_sgprs; i++) {
> + LLVMValueRef p = LLVMGetParam(func, i);
> + ret = LLVMBuildInsertValue(builder, ret, p, i, "");
> + }
> + for (unsigned i = 0; i < num_vgprs; i++) {
> + LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
> + p = LLVMBuildBitCast(builder, p, ctx->f32, "");
> + ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
> + }
> +
> + if (key->gs_prolog.states.tri_strip_adj_fix) {
> + /* Remap the input vertices for every other primitive. */
> + const unsigned vtx_params[6] = {
> + num_sgprs,
> + num_sgprs + 1,
> + num_sgprs + 3,
> + num_sgprs + 4,
> + num_sgprs + 5,
> + num_sgprs + 6
> + };
> + LLVMValueRef prim_id, rotate;
> +
> + prim_id = LLVMGetParam(func, num_sgprs + 2);
> + rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
> +
> + for (unsigned i = 0; i < 6; ++i) {
> + LLVMValueRef base, rotated, actual;
> + base = LLVMGetParam(func, vtx_params[i]);
> + rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
> + actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
> + actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
> + ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
> + }
> + }
> +
> + LLVMBuildRet(builder, ret);
> +}
> +
> +/**
> * Given a list of shader part functions, build a wrapper function that
> * runs them in sequence to form a monolithic shader.
> */
> static void si_build_wrapper_function(struct si_shader_context *ctx,
> LLVMValueRef *parts,
> unsigned num_parts,
> unsigned main_part)
> {
> struct gallivm_state *gallivm = &ctx->gallivm;
> LLVMBuilderRef builder = ctx->gallivm.builder;
> @@ -7012,20 +7084,32 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
> LLVMValueRef parts[2];
> union si_shader_part_key epilog_key;
>
> parts[0] = ctx.main_fn;
>
> si_get_vs_epilog_key(shader, &shader->key.tes.epilog, &epilog_key);
> si_build_vs_epilog_function(&ctx, &epilog_key);
> parts[1] = ctx.main_fn;
>
> si_build_wrapper_function(&ctx, parts, 2, 0);
> + } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
> + LLVMValueRef parts[2];
> + union si_shader_part_key prolog_key;
> +
> + parts[1] = ctx.main_fn;
> +
> + memset(&prolog_key, 0, sizeof(prolog_key));
> + prolog_key.gs_prolog.states = shader->key.gs.prolog;
> + si_build_gs_prolog_function(&ctx, &prolog_key);
> + parts[0] = ctx.main_fn;
> +
> + si_build_wrapper_function(&ctx, parts, 2, 1);
> } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
> LLVMValueRef parts[3];
> union si_shader_part_key prolog_key;
> union si_shader_part_key epilog_key;
> bool need_prolog;
>
> si_get_ps_prolog_key(shader, &prolog_key, false);
> need_prolog = si_need_ps_prolog(&prolog_key);
>
> parts[need_prolog ? 1 : 0] = ctx.main_fn;
> @@ -7200,20 +7284,23 @@ si_get_shader_part(struct si_screen *sscreen,
> si_init_shader_ctx(&ctx, sscreen, &shader, tm);
> ctx.type = type;
>
> switch (type) {
> case PIPE_SHADER_VERTEX:
> break;
> case PIPE_SHADER_TESS_CTRL:
> assert(!prolog);
> shader.key.tcs.epilog = key->tcs_epilog.states;
> break;
> + case PIPE_SHADER_GEOMETRY:
> + assert(prolog);
> + break;
> case PIPE_SHADER_FRAGMENT:
> if (prolog)
> shader.key.ps.prolog = key->ps_prolog.states;
> else
> shader.key.ps.epilog = key->ps_epilog.states;
> break;
> default:
> unreachable("bad shader part");
> }
>
> @@ -7524,20 +7611,41 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
>
> shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
> PIPE_SHADER_TESS_CTRL, false,
> &epilog_key, tm, debug,
> si_build_tcs_epilog_function,
> "Tessellation Control Shader Epilog");
> return shader->epilog != NULL;
> }
>
> /**
> + * Select and compile (or reuse) GS parts (prolog).
> + */
> +static bool si_shader_select_gs_parts(struct si_screen *sscreen,
> + LLVMTargetMachineRef tm,
> + struct si_shader *shader,
> + struct pipe_debug_callback *debug)
> +{
> + union si_shader_part_key prolog_key;
> +
> + memset(&prolog_key, 0, sizeof(prolog_key));
> + prolog_key.gs_prolog.states = shader->key.gs.prolog;
> +
If the workaround is disabled, you can return early here and skip
building an empty prolog.
> + shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
> + PIPE_SHADER_GEOMETRY, true,
> + &prolog_key, tm, debug,
> + si_build_gs_prolog_function,
> + "Geometry Shader Prolog");
> + return shader->prolog != NULL;
> +}
> +
> +/**
> * Build the pixel shader prolog function. This handles:
> * - two-side color selection and interpolation
> * - overriding interpolation parameters for the API PS
> * - polygon stippling
> *
> * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
> * overriden by other states. (e.g. per-sample interpolation)
> * Interpolated colors are stored after the preloaded VGPRs.
> */
> static void si_build_ps_prolog_function(struct si_shader_context *ctx,
> @@ -8040,20 +8148,24 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
> return -1;
> break;
> case PIPE_SHADER_TESS_CTRL:
> if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
> return -1;
> break;
> case PIPE_SHADER_TESS_EVAL:
> if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
> return -1;
> break;
> + case PIPE_SHADER_GEOMETRY:
> + if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
> + return - 1;
Unnecessary space between - and 1.
> + break;
> case PIPE_SHADER_FRAGMENT:
> if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
> return -1;
>
> /* Make sure we have at least as many VGPRs as there
> * are allocated inputs.
> */
> shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
> shader->info.num_input_vgprs);
> break;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 91f9cbf..d8ab2a4 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -318,20 +318,24 @@ struct si_vs_epilog_bits {
> * if PS doesn't read them
> */
> };
>
> /* Common TCS bits between the shader key and the epilog key. */
> struct si_tcs_epilog_bits {
> unsigned prim_mode:3;
> uint64_t inputs_to_copy;
> };
>
> +struct si_gs_prolog_bits {
> + unsigned tri_strip_adj_fix:1;
> +};
> +
> /* Common PS bits between the shader key and the prolog key. */
> struct si_ps_prolog_bits {
> unsigned color_two_side:1;
> unsigned flatshade_colors:1;
> unsigned poly_stipple:1;
> unsigned force_persp_sample_interp:1;
> unsigned force_linear_sample_interp:1;
> unsigned force_persp_center_interp:1;
> unsigned force_linear_center_interp:1;
> unsigned bc_optimize_for_persp:1;
> @@ -356,20 +360,23 @@ union si_shader_part_key {
> unsigned last_input:4;
> } vs_prolog;
> struct {
> struct si_vs_epilog_bits states;
> unsigned prim_id_param_offset:5;
> } vs_epilog;
> struct {
> struct si_tcs_epilog_bits states;
> } tcs_epilog;
> struct {
> + struct si_gs_prolog_bits states;
> + } gs_prolog;
> + struct {
> struct si_ps_prolog_bits states;
> unsigned num_input_sgprs:5;
> unsigned num_input_vgprs:5;
> /* Color interpolation and two-side color selection. */
> unsigned colors_read:8; /* color input components read */
> unsigned num_interp_inputs:5; /* BCOLOR is at this location */
> unsigned face_vgpr_index:5;
> unsigned wqm:1;
> char color_attr_index[2];
> char color_interp_vgpr_index[2]; /* -1 == constant */
> @@ -394,20 +401,23 @@ union si_shader_key {
> unsigned as_es:1; /* export shader */
> unsigned as_ls:1; /* local shader */
> } vs;
> struct {
> struct si_tcs_epilog_bits epilog;
> } tcs; /* tessellation control shader */
> struct {
> struct si_vs_epilog_bits epilog; /* same as VS */
> unsigned as_es:1; /* export shader */
> } tes; /* tessellation evaluation shader */
> + struct {
> + struct si_gs_prolog_bits prolog;
> + } gs;
> };
>
> struct si_shader_config {
> unsigned num_sgprs;
> unsigned num_vgprs;
> unsigned spilled_sgprs;
> unsigned spilled_vgprs;
> unsigned lds_size;
> unsigned spi_ps_input_ena;
> unsigned spi_ps_input_addr;
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index c0e2642..b934100 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -975,20 +975,38 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
> else if (sctx->tes_shader.cso)
> rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
> else
> rast_prim = info->mode;
>
> if (rast_prim != sctx->current_rast_prim) {
> sctx->current_rast_prim = rast_prim;
> sctx->do_update_shaders = true;
> }
>
> + if (sctx->gs_shader.cso) {
> + /* Determine whether the GS triangle strip adjacency fix should
> + * be applied. Rotate every other triangle if
> + * - triangle strips with adjacency are fed to the GS and
> + * - primitive restart is disabled (the rotation doesn't help
> + * when the restart occurs after an odd number of triangles).
> + */
> + bool gs_tri_strip_adj_fix =
> + !sctx->tcs_shader.cso && !sctx->tes_shader.cso &&
No need to check tcs_shader. Tess is enabled if tes_shader is set and
disabled otherwise.
With all the above fixed, the series is:
Reviewed-by: Marek Olšák <marek.olsak at amd.com>
Marek
More information about the mesa-dev mailing list