[Mesa-dev] [PATCH 31/31] radeonsi: generate GS prolog to (partially) fix triangle strip adjacency rotation

Marek Olšák maraeo at gmail.com
Wed Nov 2 21:11:29 UTC 2016


On Mon, Oct 31, 2016 at 11:11 PM, Nicolai Hähnle <nhaehnle at gmail.com> wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Fixes GL45-CTS.geometry_shader.adjacency.adjacency_indiced_triangle_strip and
> others.
>
> This leaves the case of triangle strips with adjacency and primitive restarts
> open. It seems that the only thing that cares about that is a piglit test.
> Fixing this efficiently would be really involved, and I don't want to use the
> hammer of degrading to software handling of indices because there may well
> be software that uses this draw mode (without caring about the precise
> rotation of triangles).
> ---
>  src/gallium/drivers/radeonsi/si_pipe.c          |   1 +
>  src/gallium/drivers/radeonsi/si_pipe.h          |   2 +
>  src/gallium/drivers/radeonsi/si_shader.c        | 112 ++++++++++++++++++++++++
>  src/gallium/drivers/radeonsi/si_shader.h        |  10 +++
>  src/gallium/drivers/radeonsi/si_state_draw.c    |  18 ++++
>  src/gallium/drivers/radeonsi/si_state_shaders.c |   7 +-
>  6 files changed, 146 insertions(+), 4 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index bf3b442..bc633bb 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -665,20 +665,21 @@ static int si_get_shader_param(struct pipe_screen* pscreen, unsigned shader, enu
>         return 0;
>  }
>
>  static void si_destroy_screen(struct pipe_screen* pscreen)
>  {
>         struct si_screen *sscreen = (struct si_screen *)pscreen;
>         struct si_shader_part *parts[] = {
>                 sscreen->vs_prologs,
>                 sscreen->vs_epilogs,
>                 sscreen->tcs_epilogs,
> +               sscreen->gs_prologs,
>                 sscreen->ps_prologs,
>                 sscreen->ps_epilogs
>         };
>         unsigned i;
>
>         if (!sscreen)
>                 return;
>
>         if (!sscreen->b.ws->unref(sscreen->b.ws))
>                 return;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.h b/src/gallium/drivers/radeonsi/si_pipe.h
> index e7617bc..8e6a94d 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.h
> +++ b/src/gallium/drivers/radeonsi/si_pipe.h
> @@ -89,20 +89,21 @@ struct si_screen {
>         bool                            has_ds_bpermute;
>
>         /* Whether shaders are monolithic (1-part) or separate (3-part). */
>         bool                            use_monolithic_shaders;
>         bool                            record_llvm_ir;
>
>         pipe_mutex                      shader_parts_mutex;
>         struct si_shader_part           *vs_prologs;
>         struct si_shader_part           *vs_epilogs;
>         struct si_shader_part           *tcs_epilogs;
> +       struct si_shader_part           *gs_prologs;
>         struct si_shader_part           *ps_prologs;
>         struct si_shader_part           *ps_epilogs;
>
>         /* Shader cache in memory.
>          *
>          * Design & limitations:
>          * - The shader cache is per screen (= per process), never saved to
>          *   disk, and skips redundant shader compilations from TGSI to bytecode.
>          * - It can only be used with one-variant-per-shader support, in which
>          *   case only the main (typically middle) part of shaders is cached.
> @@ -312,20 +313,21 @@ struct si_context {
>         int                     last_sh_base_reg;
>         int                     last_primitive_restart_en;
>         int                     last_restart_index;
>         int                     last_gs_out_prim;
>         int                     last_prim;
>         int                     last_multi_vgt_param;
>         int                     last_rast_prim;
>         unsigned                last_sc_line_stipple;
>         int                     last_vtx_reuse_depth;
>         int                     current_rast_prim; /* primitive type after TES, GS */
> +       bool                    gs_tri_strip_adj_fix;
>         unsigned                last_gsvs_itemsize;
>
>         /* Scratch buffer */
>         struct r600_resource    *scratch_buffer;
>         bool                    emit_scratch_reloc;
>         unsigned                scratch_waves;
>         unsigned                spi_tmpring_size;
>
>         struct r600_resource    *compute_scratch_buffer;
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index fe15420..9141d62 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -6740,20 +6740,92 @@ static void si_get_ps_epilog_key(struct si_shader *shader,
>         struct tgsi_shader_info *info = &shader->selector->info;
>         memset(key, 0, sizeof(*key));
>         key->ps_epilog.colors_written = info->colors_written;
>         key->ps_epilog.writes_z = info->writes_z;
>         key->ps_epilog.writes_stencil = info->writes_stencil;
>         key->ps_epilog.writes_samplemask = info->writes_samplemask;
>         key->ps_epilog.states = shader->key.ps.epilog;
>  }
>
>  /**
> + * Build the GS prolog function. Rotate the input vertices for triangle strips
> + * with adjacency.
> + */
> +static void si_build_gs_prolog_function(struct si_shader_context *ctx,
> +                                       union si_shader_part_key *key)
> +{
> +       const unsigned num_sgprs = SI_GS_NUM_USER_SGPR + 2;
> +       const unsigned num_vgprs = 8;
> +       struct gallivm_state *gallivm = &ctx->gallivm;
> +       LLVMBuilderRef builder = gallivm->builder;
> +       LLVMTypeRef params[32];
> +       LLVMTypeRef returns[32];
> +       LLVMValueRef func, ret;
> +
> +       for (unsigned i = 0; i < num_sgprs; ++i) {
> +               params[i] = ctx->i32;
> +               returns[i] = ctx->i32;
> +       }
> +
> +       for (unsigned i = 0; i < num_vgprs; ++i) {
> +               params[num_sgprs + i] = ctx->i32;
> +               returns[num_sgprs + i] = ctx->f32;
> +       }
> +
> +       /* Create the function. */
> +       si_create_function(ctx, "gs_prolog", returns, num_sgprs + num_vgprs,
> +                          params, num_sgprs + num_vgprs, num_sgprs - 1);
> +       func = ctx->main_fn;
> +
> +       /* Copy inputs to outputs. This should be no-op, as the registers match,
> +        * but it will prevent the compiler from overwriting them unintentionally.
> +        */
> +       ret = ctx->return_value;
> +       for (unsigned i = 0; i < num_sgprs; i++) {
> +               LLVMValueRef p = LLVMGetParam(func, i);
> +               ret = LLVMBuildInsertValue(builder, ret, p, i, "");
> +       }
> +       for (unsigned i = 0; i < num_vgprs; i++) {
> +               LLVMValueRef p = LLVMGetParam(func, num_sgprs + i);
> +               p = LLVMBuildBitCast(builder, p, ctx->f32, "");
> +               ret = LLVMBuildInsertValue(builder, ret, p, num_sgprs + i, "");
> +       }
> +
> +       if (key->gs_prolog.states.tri_strip_adj_fix) {
> +               /* Remap the input vertices for every other primitive. */
> +               const unsigned vtx_params[6] = {
> +                       num_sgprs,
> +                       num_sgprs + 1,
> +                       num_sgprs + 3,
> +                       num_sgprs + 4,
> +                       num_sgprs + 5,
> +                       num_sgprs + 6
> +               };
> +               LLVMValueRef prim_id, rotate;
> +
> +               prim_id = LLVMGetParam(func, num_sgprs + 2);
> +               rotate = LLVMBuildTrunc(builder, prim_id, ctx->i1, "");
> +
> +               for (unsigned i = 0; i < 6; ++i) {
> +                       LLVMValueRef base, rotated, actual;
> +                       base = LLVMGetParam(func, vtx_params[i]);
> +                       rotated = LLVMGetParam(func, vtx_params[(i + 4) % 6]);
> +                       actual = LLVMBuildSelect(builder, rotate, rotated, base, "");
> +                       actual = LLVMBuildBitCast(builder, actual, ctx->f32, "");
> +                       ret = LLVMBuildInsertValue(builder, ret, actual, vtx_params[i], "");
> +               }
> +       }
> +
> +       LLVMBuildRet(builder, ret);
> +}
> +
> +/**
>   * Given a list of shader part functions, build a wrapper function that
>   * runs them in sequence to form a monolithic shader.
>   */
>  static void si_build_wrapper_function(struct si_shader_context *ctx,
>                                       LLVMValueRef *parts,
>                                       unsigned num_parts,
>                                       unsigned main_part)
>  {
>         struct gallivm_state *gallivm = &ctx->gallivm;
>         LLVMBuilderRef builder = ctx->gallivm.builder;
> @@ -7012,20 +7084,32 @@ int si_compile_tgsi_shader(struct si_screen *sscreen,
>                 LLVMValueRef parts[2];
>                 union si_shader_part_key epilog_key;
>
>                 parts[0] = ctx.main_fn;
>
>                 si_get_vs_epilog_key(shader, &shader->key.tes.epilog, &epilog_key);
>                 si_build_vs_epilog_function(&ctx, &epilog_key);
>                 parts[1] = ctx.main_fn;
>
>                 si_build_wrapper_function(&ctx, parts, 2, 0);
> +       } else if (is_monolithic && ctx.type == PIPE_SHADER_GEOMETRY) {
> +               LLVMValueRef parts[2];
> +               union si_shader_part_key prolog_key;
> +
> +               parts[1] = ctx.main_fn;
> +
> +               memset(&prolog_key, 0, sizeof(prolog_key));
> +               prolog_key.gs_prolog.states = shader->key.gs.prolog;
> +               si_build_gs_prolog_function(&ctx, &prolog_key);
> +               parts[0] = ctx.main_fn;
> +
> +               si_build_wrapper_function(&ctx, parts, 2, 1);
>         } else if (is_monolithic && ctx.type == PIPE_SHADER_FRAGMENT) {
>                 LLVMValueRef parts[3];
>                 union si_shader_part_key prolog_key;
>                 union si_shader_part_key epilog_key;
>                 bool need_prolog;
>
>                 si_get_ps_prolog_key(shader, &prolog_key, false);
>                 need_prolog = si_need_ps_prolog(&prolog_key);
>
>                 parts[need_prolog ? 1 : 0] = ctx.main_fn;
> @@ -7200,20 +7284,23 @@ si_get_shader_part(struct si_screen *sscreen,
>         si_init_shader_ctx(&ctx, sscreen, &shader, tm);
>         ctx.type = type;
>
>         switch (type) {
>         case PIPE_SHADER_VERTEX:
>                 break;
>         case PIPE_SHADER_TESS_CTRL:
>                 assert(!prolog);
>                 shader.key.tcs.epilog = key->tcs_epilog.states;
>                 break;
> +       case PIPE_SHADER_GEOMETRY:
> +               assert(prolog);
> +               break;
>         case PIPE_SHADER_FRAGMENT:
>                 if (prolog)
>                         shader.key.ps.prolog = key->ps_prolog.states;
>                 else
>                         shader.key.ps.epilog = key->ps_epilog.states;
>                 break;
>         default:
>                 unreachable("bad shader part");
>         }
>
> @@ -7524,20 +7611,41 @@ static bool si_shader_select_tcs_parts(struct si_screen *sscreen,
>
>         shader->epilog = si_get_shader_part(sscreen, &sscreen->tcs_epilogs,
>                                             PIPE_SHADER_TESS_CTRL, false,
>                                             &epilog_key, tm, debug,
>                                             si_build_tcs_epilog_function,
>                                             "Tessellation Control Shader Epilog");
>         return shader->epilog != NULL;
>  }
>
>  /**
> + * Select and compile (or reuse) GS parts (prolog).
> + */
> +static bool si_shader_select_gs_parts(struct si_screen *sscreen,
> +                                     LLVMTargetMachineRef tm,
> +                                     struct si_shader *shader,
> +                                     struct pipe_debug_callback *debug)
> +{
> +       union si_shader_part_key prolog_key;
> +
> +       memset(&prolog_key, 0, sizeof(prolog_key));
> +       prolog_key.gs_prolog.states = shader->key.gs.prolog;
> +

If the workaround is disabled (tri_strip_adj_fix is not set), you can
return true here and skip building an empty prolog.

> +       shader->prolog = si_get_shader_part(sscreen, &sscreen->gs_prologs,
> +                                           PIPE_SHADER_GEOMETRY, true,
> +                                           &prolog_key, tm, debug,
> +                                           si_build_gs_prolog_function,
> +                                           "Geometry Shader Prolog");
> +       return shader->prolog != NULL;
> +}
> +
> +/**
>   * Build the pixel shader prolog function. This handles:
>   * - two-side color selection and interpolation
>   * - overriding interpolation parameters for the API PS
>   * - polygon stippling
>   *
>   * All preloaded SGPRs and VGPRs are passed through unmodified unless they are
>   * overriden by other states. (e.g. per-sample interpolation)
>   * Interpolated colors are stored after the preloaded VGPRs.
>   */
>  static void si_build_ps_prolog_function(struct si_shader_context *ctx,
> @@ -8040,20 +8148,24 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
>                                 return -1;
>                         break;
>                 case PIPE_SHADER_TESS_CTRL:
>                         if (!si_shader_select_tcs_parts(sscreen, tm, shader, debug))
>                                 return -1;
>                         break;
>                 case PIPE_SHADER_TESS_EVAL:
>                         if (!si_shader_select_tes_parts(sscreen, tm, shader, debug))
>                                 return -1;
>                         break;
> +               case PIPE_SHADER_GEOMETRY:
> +                       if (!si_shader_select_gs_parts(sscreen, tm, shader, debug))
> +                               return - 1;

Unnecessary space between "-" and "1"; this should be "return -1;".

> +                       break;
>                 case PIPE_SHADER_FRAGMENT:
>                         if (!si_shader_select_ps_parts(sscreen, tm, shader, debug))
>                                 return -1;
>
>                         /* Make sure we have at least as many VGPRs as there
>                          * are allocated inputs.
>                          */
>                         shader->config.num_vgprs = MAX2(shader->config.num_vgprs,
>                                                         shader->info.num_input_vgprs);
>                         break;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 91f9cbf..d8ab2a4 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -318,20 +318,24 @@ struct si_vs_epilog_bits {
>          *   if PS doesn't read them
>          */
>  };
>
>  /* Common TCS bits between the shader key and the epilog key. */
>  struct si_tcs_epilog_bits {
>         unsigned        prim_mode:3;
>         uint64_t        inputs_to_copy;
>  };
>
> +struct si_gs_prolog_bits {
> +       unsigned        tri_strip_adj_fix:1;
> +};
> +
>  /* Common PS bits between the shader key and the prolog key. */
>  struct si_ps_prolog_bits {
>         unsigned        color_two_side:1;
>         unsigned        flatshade_colors:1;
>         unsigned        poly_stipple:1;
>         unsigned        force_persp_sample_interp:1;
>         unsigned        force_linear_sample_interp:1;
>         unsigned        force_persp_center_interp:1;
>         unsigned        force_linear_center_interp:1;
>         unsigned        bc_optimize_for_persp:1;
> @@ -356,20 +360,23 @@ union si_shader_part_key {
>                 unsigned        last_input:4;
>         } vs_prolog;
>         struct {
>                 struct si_vs_epilog_bits states;
>                 unsigned        prim_id_param_offset:5;
>         } vs_epilog;
>         struct {
>                 struct si_tcs_epilog_bits states;
>         } tcs_epilog;
>         struct {
> +               struct si_gs_prolog_bits states;
> +       } gs_prolog;
> +       struct {
>                 struct si_ps_prolog_bits states;
>                 unsigned        num_input_sgprs:5;
>                 unsigned        num_input_vgprs:5;
>                 /* Color interpolation and two-side color selection. */
>                 unsigned        colors_read:8; /* color input components read */
>                 unsigned        num_interp_inputs:5; /* BCOLOR is at this location */
>                 unsigned        face_vgpr_index:5;
>                 unsigned        wqm:1;
>                 char            color_attr_index[2];
>                 char            color_interp_vgpr_index[2]; /* -1 == constant */
> @@ -394,20 +401,23 @@ union si_shader_key {
>                 unsigned        as_es:1; /* export shader */
>                 unsigned        as_ls:1; /* local shader */
>         } vs;
>         struct {
>                 struct si_tcs_epilog_bits epilog;
>         } tcs; /* tessellation control shader */
>         struct {
>                 struct si_vs_epilog_bits epilog; /* same as VS */
>                 unsigned        as_es:1; /* export shader */
>         } tes; /* tessellation evaluation shader */
> +       struct {
> +               struct si_gs_prolog_bits prolog;
> +       } gs;
>  };
>
>  struct si_shader_config {
>         unsigned                        num_sgprs;
>         unsigned                        num_vgprs;
>         unsigned                        spilled_sgprs;
>         unsigned                        spilled_vgprs;
>         unsigned                        lds_size;
>         unsigned                        spi_ps_input_ena;
>         unsigned                        spi_ps_input_addr;
> diff --git a/src/gallium/drivers/radeonsi/si_state_draw.c b/src/gallium/drivers/radeonsi/si_state_draw.c
> index c0e2642..b934100 100644
> --- a/src/gallium/drivers/radeonsi/si_state_draw.c
> +++ b/src/gallium/drivers/radeonsi/si_state_draw.c
> @@ -975,20 +975,38 @@ void si_draw_vbo(struct pipe_context *ctx, const struct pipe_draw_info *info)
>         else if (sctx->tes_shader.cso)
>                 rast_prim = sctx->tes_shader.cso->info.properties[TGSI_PROPERTY_TES_PRIM_MODE];
>         else
>                 rast_prim = info->mode;
>
>         if (rast_prim != sctx->current_rast_prim) {
>                 sctx->current_rast_prim = rast_prim;
>                 sctx->do_update_shaders = true;
>         }
>
> +       if (sctx->gs_shader.cso) {
> +               /* Determine whether the GS triangle strip adjacency fix should
> +                * be applied. Rotate every other triangle if
> +                * - triangle strips with adjacency are fed to the GS and
> +                * - primitive restart is disabled (the rotation doesn't help
> +                *   when the restart occurs after an odd number of triangles).
> +                */
> +               bool gs_tri_strip_adj_fix =
> +                       !sctx->tcs_shader.cso && !sctx->tes_shader.cso &&

No need to check tcs_shader. Tessellation is enabled if tes_shader is
set and disabled otherwise, so checking tes_shader alone is sufficient.

With all the above fixed, the series is:

Reviewed-by: Marek Olšák <marek.olsak at amd.com>

Marek


More information about the mesa-dev mailing list