[Mesa-dev] [PATCH 2/2] radeonsi: move instance divisors into a constant buffer

Nicolai Hähnle nhaehnle at gmail.com
Tue Jun 27 07:22:05 UTC 2017


On 27.06.2017 02:14, Marek Olšák wrote:
> From: Marek Olšák <marek.olsak at amd.com>
> 
> Shader key size: 107 -> 47

Nice improvement.


> Divisors of 0 and 1 are encoded in the shader key. Greater instance divisors
> are loaded from a constant buffer.
> 
> The shader code doing the division is huge. Is it something we need to
> worry about? Does any app use instance divisors >= 2?

This reminds me of a certain LLVM improvement that I still need to clear.

I doubt instance divisors >= 2 are used. As a data point, Vulkan doesn't 
support it as a feature at all, IIRC.

Can we get an optimized monolithic shader variant built for shaders that 
have to fetch? This should help if anybody ever triggers this, because 
surely not all vertex attributes will require it, and at least the 
latency of the loads can then be used to perhaps hide some of the divide 
cost.

Apart from that, patch is

Reviewed-by: Nicolai Hähnle <nicolai.haehnle at amd.com>


> 
> VS prolog disassembly:
>      s_load_dwordx4 s[12:15], s[0:1], 0x80  ; C00A0300 00000080
>      s_nop 0                                ; BF800000
>      s_waitcnt lgkmcnt(0)                   ; BF8C007F
>      s_buffer_load_dword s14, s[12:15], 0x4 ; C0220386 00000004
>      s_waitcnt lgkmcnt(0)                   ; BF8C007F
>      v_cvt_f32_u32_e32 v4, s14              ; 7E080C0E
>      v_rcp_iflag_f32_e32 v4, v4             ; 7E084704
>      v_mul_f32_e32 v4, 0x4f800000, v4       ; 0A0808FF 4F800000
>      v_cvt_u32_f32_e32 v4, v4               ; 7E080F04
>      v_mul_hi_u32 v5, v4, s14               ; D2860005 00001D04
>      v_mul_lo_i32 v6, v4, s14               ; D2850006 00001D04
>      v_cmp_eq_u32_e64 s[12:13], 0, v5       ; D0CA000C 00020A80
>      v_sub_i32_e32 v5, vcc, 0, v6           ; 340A0C80
>      v_cndmask_b32_e64 v5, v6, v5, s[12:13] ; D1000005 00320B06
>      v_mul_hi_u32 v5, v5, v4                ; D2860005 00020905
>      v_add_i32_e32 v6, vcc, v5, v4          ; 320C0905
>      v_subrev_i32_e32 v4, vcc, v5, v4       ; 36080905
>      v_cndmask_b32_e64 v4, v4, v6, s[12:13] ; D1000004 00320D04
>      v_mul_hi_u32 v5, v4, v1                ; D2860005 00020304
>      v_add_i32_e32 v4, vcc, s8, v0          ; 32080008
>      v_mul_lo_i32 v6, v5, s14               ; D2850006 00001D05
>      v_add_i32_e32 v7, vcc, 1, v5           ; 320E0A81
>      v_cmp_ge_u32_e64 s[12:13], v1, v6      ; D0CE000C 00020D01
>      v_sub_i32_e32 v6, vcc, v1, v6          ; 340C0D01
>      v_cmp_le_u32_e32 vcc, s14, v6          ; 7D960C0E
>      v_cndmask_b32_e64 v8, 0, -1, s[12:13]  ; D1000008 00318280
>      v_cndmask_b32_e64 v6, 0, -1, vcc       ; D1000006 01A98280
>      v_and_b32_e32 v6, v8, v6               ; 260C0D08
>      v_cmp_eq_u32_e32 vcc, 0, v6            ; 7D940C80
>      v_cndmask_b32_e32 v6, v7, v5, vcc      ; 000C0B07
>      v_add_i32_e32 v5, vcc, -1, v5          ; 320A0AC1
>      v_cmp_eq_u32_e32 vcc, 0, v8            ; 7D941080
>      v_cndmask_b32_e32 v5, v6, v5, vcc      ; 000A0B06
>      v_add_i32_e32 v5, vcc, s9, v5          ; 320A0A09
> ---
>   src/gallium/drivers/radeonsi/si_descriptors.c   |  2 +
>   src/gallium/drivers/radeonsi/si_pipe.c          |  2 +
>   src/gallium/drivers/radeonsi/si_shader.c        | 78 +++++++++++++++++--------
>   src/gallium/drivers/radeonsi/si_shader.h        |  9 ++-
>   src/gallium/drivers/radeonsi/si_state.c         | 15 +++++
>   src/gallium/drivers/radeonsi/si_state.h         |  3 +
>   src/gallium/drivers/radeonsi/si_state_shaders.c |  7 ++-
>   7 files changed, 88 insertions(+), 28 deletions(-)
> 
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 75d2a1d..88f7dce 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -2185,20 +2185,22 @@ void si_emit_graphics_shader_userdata(struct si_context *sctx,
>   					       R_00B208_SPI_SHADER_USER_DATA_ADDR_LO_GS);
>   			si_emit_shader_pointer(sctx, descs,
>   					       R_00B408_SPI_SHADER_USER_DATA_ADDR_LO_HS);
>   		} else {
>   			si_emit_shader_pointer(sctx, descs,
>   					       R_00B230_SPI_SHADER_USER_DATA_GS_0);
>   			si_emit_shader_pointer(sctx, descs,
>   					       R_00B330_SPI_SHADER_USER_DATA_ES_0);
>   			si_emit_shader_pointer(sctx, descs,
>   					       R_00B430_SPI_SHADER_USER_DATA_HS_0);
> +			si_emit_shader_pointer(sctx, descs,
> +					       R_00B530_SPI_SHADER_USER_DATA_LS_0);
>   		}
>   	}
>   
>   	mask = sctx->shader_pointers_dirty &
>   	       u_bit_consecutive(SI_DESCS_FIRST_SHADER,
>   				 SI_DESCS_FIRST_COMPUTE - SI_DESCS_FIRST_SHADER);
>   
>   	while (mask) {
>   		unsigned i = u_bit_scan(&mask);
>   		unsigned shader = (i - SI_DESCS_FIRST_SHADER) / SI_NUM_SHADER_DESCS;
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 4088849..a940bb8 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -301,20 +301,22 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen,
>   
>   		for (shader = 0; shader < SI_NUM_SHADERS; shader++) {
>   			for (i = 0; i < SI_NUM_CONST_BUFFERS; i++) {
>   				sctx->b.b.set_constant_buffer(&sctx->b.b, shader, i,
>   							      &sctx->null_const_buf);
>   			}
>   		}
>   
>   		si_set_rw_buffer(sctx, SI_HS_CONST_DEFAULT_TESS_LEVELS,
>   				 &sctx->null_const_buf);
> +		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS,
> +				 &sctx->null_const_buf);
>   		si_set_rw_buffer(sctx, SI_VS_CONST_CLIP_PLANES,
>   				 &sctx->null_const_buf);
>   		si_set_rw_buffer(sctx, SI_PS_CONST_POLY_STIPPLE,
>   				 &sctx->null_const_buf);
>   		si_set_rw_buffer(sctx, SI_PS_CONST_SAMPLE_POSITIONS,
>   				 &sctx->null_const_buf);
>   
>   		/* Clear the NULL constant buffer, because loads should return zeros. */
>   		sctx->b.clear_buffer(&sctx->b.b, sctx->null_const_buf.buffer, 0,
>   				     sctx->null_const_buf.buffer->width0, 0,
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 42b08bf..55d1232 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -305,31 +305,30 @@ get_tcs_out_current_patch_data_offset(struct si_shader_context *ctx)
>   	LLVMValueRef rel_patch_id = get_rel_patch_id(ctx);
>   
>   	return LLVMBuildAdd(gallivm->builder, patch0_patch_data_offset,
>   			    LLVMBuildMul(gallivm->builder, patch_stride,
>   					 rel_patch_id, ""),
>   			    "");
>   }
>   
>   static LLVMValueRef get_instance_index_for_fetch(
>   	struct si_shader_context *ctx,
> -	unsigned param_start_instance, unsigned divisor)
> +	unsigned param_start_instance, LLVMValueRef divisor)
>   {
>   	struct gallivm_state *gallivm = &ctx->gallivm;
>   
>   	LLVMValueRef result = LLVMGetParam(ctx->main_fn,
>   					   ctx->param_instance_id);
>   
>   	/* The division must be done before START_INSTANCE is added. */
> -	if (divisor > 1)
> -		result = LLVMBuildUDiv(gallivm->builder, result,
> -				LLVMConstInt(ctx->i32, divisor, 0), "");
> +	if (divisor != ctx->i32_1)
> +		result = LLVMBuildUDiv(gallivm->builder, result, divisor, "");
>   
>   	return LLVMBuildAdd(gallivm->builder, result,
>   			    LLVMGetParam(ctx->main_fn, param_start_instance), "");
>   }
>   
>   /* Bitcast <4 x float> to <2 x double>, extract the component, and convert
>    * to float. */
>   static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
>   					    LLVMValueRef vec4,
>   					    unsigned double_index)
> @@ -5275,26 +5274,24 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
>   		FREE(shader);
>   		shader = NULL;
>   	}
>   	return shader;
>   }
>   
>   static void si_dump_shader_key_vs(const struct si_shader_key *key,
>   				  const struct si_vs_prolog_bits *prolog,
>   				  const char *prefix, FILE *f)
>   {
> -	fprintf(f, "  %s.instance_divisors = {", prefix);
> -	for (int i = 0; i < ARRAY_SIZE(prolog->instance_divisors); i++) {
> -		fprintf(f, !i ? "%u" : ", %u",
> -			prolog->instance_divisors[i]);
> -	}
> -	fprintf(f, "}\n");
> +	fprintf(f, "  %s.instance_divisor_is_one = %u\n",
> +		prefix, prolog->instance_divisor_is_one);
> +	fprintf(f, "  %s.instance_divisor_is_fetched = %u\n",
> +		prefix, prolog->instance_divisor_is_fetched);
>   
>   	fprintf(f, "  mono.vs.fix_fetch = {");
>   	for (int i = 0; i < SI_MAX_ATTRIBS; i++)
>   		fprintf(f, !i ? "%u" : ", %u", key->mono.vs_fix_fetch[i]);
>   	fprintf(f, "}\n");
>   }
>   
>   static void si_dump_shader_key(unsigned processor, const struct si_shader *shader,
>   			       FILE *f)
>   {
> @@ -5596,24 +5593,26 @@ static void si_get_vs_prolog_key(const struct tgsi_shader_info *info,
>   	key->vs_prolog.last_input = MAX2(1, info->num_inputs) - 1;
>   	key->vs_prolog.as_ls = shader_out->key.as_ls;
>   
>   	if (shader_out->selector->type == PIPE_SHADER_TESS_CTRL) {
>   		key->vs_prolog.as_ls = 1;
>   		key->vs_prolog.num_merged_next_stage_vgprs = 2;
>   	} else if (shader_out->selector->type == PIPE_SHADER_GEOMETRY) {
>   		key->vs_prolog.num_merged_next_stage_vgprs = 5;
>   	}
>   
> -	/* Set the instanceID flag. */
> -	for (unsigned i = 0; i < info->num_inputs; i++)
> -		if (key->vs_prolog.states.instance_divisors[i])
> -			shader_out->info.uses_instanceid = true;
> +	/* Enable loading the InstanceID VGPR. */
> +	uint16_t input_mask = u_bit_consecutive(0, info->num_inputs);
> +
> +	if ((key->vs_prolog.states.instance_divisor_is_one |
> +	     key->vs_prolog.states.instance_divisor_is_fetched) & input_mask)
> +		shader_out->info.uses_instanceid = true;
>   }
>   
>   /**
>    * Compute the PS prolog key, which contains all the information needed to
>    * build the PS prolog function, and set related bits in shader->config.
>    */
>   static void si_get_ps_prolog_key(struct si_shader *shader,
>   				 union si_shader_part_key *key,
>   				 bool separate_prolog)
>   {
> @@ -6520,20 +6519,35 @@ si_get_shader_part(struct si_screen *sscreen,
>   
>   	result->next = *list;
>   	*list = result;
>   
>   out:
>   	si_llvm_dispose(&ctx);
>   	mtx_unlock(&sscreen->shader_parts_mutex);
>   	return result;
>   }
>   
> +static LLVMValueRef si_prolog_get_rw_buffers(struct si_shader_context *ctx)
> +{
> +	struct gallivm_state *gallivm = &ctx->gallivm;
> +	LLVMValueRef ptr[2], list;
> +
> +	/* Get the pointer to rw buffers. */
> +	ptr[0] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS);
> +	ptr[1] = LLVMGetParam(ctx->main_fn, SI_SGPR_RW_BUFFERS_HI);
> +	list = lp_build_gather_values(gallivm, ptr, 2);
> +	list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
> +	list = LLVMBuildIntToPtr(gallivm->builder, list,
> +				 si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
> +	return list;
> +}
> +
>   /**
>    * Build the vertex shader prolog function.
>    *
>    * The inputs are the same as VS (a lot of SGPRs and 4 VGPR system values).
>    * All inputs are returned unmodified. The vertex load indices are
>    * stored after them, which will be used by the API VS for fetching inputs.
>    *
>    * For example, the expected outputs for instance_divisors[] = {0, 1, 2} are:
>    *   input_v0,
>    *   input_v1,
> @@ -6602,25 +6616,47 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
>   		LLVMValueRef p = LLVMGetParam(func, i);
>   		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
>   	}
>   	for (; i < num_params; i++) {
>   		LLVMValueRef p = LLVMGetParam(func, i);
>   		p = LLVMBuildBitCast(gallivm->builder, p, ctx->f32, "");
>   		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
>   	}
>   
>   	/* Compute vertex load indices from instance divisors. */
> +	LLVMValueRef instance_divisor_constbuf = NULL;
> +
> +	if (key->vs_prolog.states.instance_divisor_is_fetched) {
> +		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
> +		LLVMValueRef buf_index =
> +			LLVMConstInt(ctx->i32, SI_VS_CONST_INSTANCE_DIVISORS, 0);
> +		instance_divisor_constbuf =
> +			ac_build_indexed_load_const(&ctx->ac, list, buf_index);
> +	}
> +
>   	for (i = 0; i <= key->vs_prolog.last_input; i++) {
> -		unsigned divisor = key->vs_prolog.states.instance_divisors[i];
> +		bool divisor_is_one =
> +			key->vs_prolog.states.instance_divisor_is_one & (1u << i);
> +		bool divisor_is_fetched =
> +			key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
>   		LLVMValueRef index;
>   
> -		if (divisor) {
> +		if (divisor_is_one || divisor_is_fetched) {
> +			LLVMValueRef divisor = ctx->i32_1;
> +
> +			if (divisor_is_fetched) {
> +				divisor = buffer_load_const(ctx, instance_divisor_constbuf,
> +							    LLVMConstInt(ctx->i32, i * 4, 0));
> +				divisor = LLVMBuildBitCast(gallivm->builder, divisor,
> +							   ctx->i32, "");
> +			}
> +
>   			/* InstanceID / Divisor + StartInstance */
>   			index = get_instance_index_for_fetch(ctx,
>   							     user_sgpr_base +
>   							     SI_SGPR_START_INSTANCE,
>   							     divisor);
>   		} else {
>   			/* VertexID + BaseVertex */
>   			index = LLVMBuildAdd(gallivm->builder,
>   					     LLVMGetParam(func, ctx->param_vertex_id),
>   					     LLVMGetParam(func, user_sgpr_base +
> @@ -6859,29 +6895,21 @@ static void si_build_ps_prolog_function(struct si_shader_context *ctx,
>   	for (i = 0; i < num_params; i++) {
>   		LLVMValueRef p = LLVMGetParam(func, i);
>   		ret = LLVMBuildInsertValue(gallivm->builder, ret, p, i, "");
>   	}
>   
>   	/* Polygon stippling. */
>   	if (key->ps_prolog.states.poly_stipple) {
>   		/* POS_FIXED_PT is always last. */
>   		unsigned pos = key->ps_prolog.num_input_sgprs +
>   			       key->ps_prolog.num_input_vgprs - 1;
> -		LLVMValueRef ptr[2], list;
> -
> -		/* Get the pointer to rw buffers. */
> -		ptr[0] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS);
> -		ptr[1] = LLVMGetParam(func, SI_SGPR_RW_BUFFERS_HI);
> -		list = lp_build_gather_values(gallivm, ptr, 2);
> -		list = LLVMBuildBitCast(gallivm->builder, list, ctx->i64, "");
> -		list = LLVMBuildIntToPtr(gallivm->builder, list,
> -					  si_const_array(ctx->v4i32, SI_NUM_RW_BUFFERS), "");
> +		LLVMValueRef list = si_prolog_get_rw_buffers(ctx);
>   
>   		si_llvm_emit_polygon_stipple(ctx, list, pos);
>   	}
>   
>   	if (key->ps_prolog.states.bc_optimize_for_persp ||
>   	    key->ps_prolog.states.bc_optimize_for_linear) {
>   		unsigned i, base = key->ps_prolog.num_input_sgprs;
>   		LLVMValueRef center[2], centroid[2], tmp, bc_optimize;
>   
>   		/* The shader should do: if (PRIM_MASK[31]) CENTROID = CENTER;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
> index 6432126..a10067d 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.h
> +++ b/src/gallium/drivers/radeonsi/si_shader.h
> @@ -378,21 +378,28 @@ struct si_shader_selector {
>    * -> = merged with the next stage
>    */
>   
>   /* Use the byte alignment for all following structure members for optimal
>    * shader key memory footprint.
>    */
>   #pragma pack(push, 1)
>   
>   /* Common VS bits between the shader key and the prolog key. */
>   struct si_vs_prolog_bits {
> -	unsigned	instance_divisors[SI_MAX_ATTRIBS];
> +	/* - If neither "is_one" nor "is_fetched" has a bit set, the instance
> +	 *   divisor is 0.
> +	 * - If "is_one" has a bit set, the instance divisor is 1.
> +	 * - If "is_fetched" has a bit set, the instance divisor will be loaded
> +	 *   from the constant buffer.
> +	 */
> +	uint16_t	instance_divisor_is_one;     /* bitmask of inputs */
> +	uint16_t	instance_divisor_is_fetched; /* bitmask of inputs */
>   };
>   
>   /* Common TCS bits between the shader key and the epilog key. */
>   struct si_tcs_epilog_bits {
>   	unsigned	prim_mode:3;
>   	unsigned	tes_reads_tess_factors:1;
>   };
>   
>   struct si_gs_prolog_bits {
>   	unsigned	tri_strip_adj_fix:1;
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index a674a60..7e3d1a0 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -3766,20 +3766,25 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
>   		unsigned char swizzle[4];
>   
>   		if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
>   			FREE(v);
>   			return NULL;
>   		}
>   
>   		if (elements[i].instance_divisor) {
>   			v->uses_instance_divisors = true;
>   			v->instance_divisors[i] = elements[i].instance_divisor;
> +
> +			if (v->instance_divisors[i] == 1)
> +				v->instance_divisor_is_one |= 1u << i;
> +			else
> +				v->instance_divisor_is_fetched |= 1u << i;
>   		}
>   
>   		if (!used[vbo_index]) {
>   			v->first_vb_use_mask |= 1 << i;
>   			used[vbo_index] = true;
>   		}
>   
>   		desc = util_format_description(elements[i].src_format);
>   		first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
>   		data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
> @@ -3894,20 +3899,30 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
>   	sctx->vertex_elements = v;
>   	sctx->vertex_buffers_dirty = true;
>   
>   	if (v &&
>   	    (!old ||
>   	     old->count != v->count ||
>   	     old->uses_instance_divisors != v->uses_instance_divisors ||
>   	     v->uses_instance_divisors || /* we don't check which divisors changed */
>   	     memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
>   		sctx->do_update_shaders = true;
> +
> +	if (v && v->instance_divisor_is_fetched) {
> +		struct pipe_constant_buffer cb;
> +
> +		cb.buffer = NULL;
> +		cb.user_buffer = v->instance_divisors;
> +		cb.buffer_offset = 0;
> +		cb.buffer_size = sizeof(uint32_t) * v->count;
> +		si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
> +	}
>   }
>   
>   static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
>   {
>   	struct si_context *sctx = (struct si_context *)ctx;
>   
>   	if (sctx->vertex_elements == state)
>   		sctx->vertex_elements = NULL;
>   	FREE(state);
>   }
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index c9e0770..ec28aba 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -108,20 +108,22 @@ struct si_vertex_elements
>   	uint8_t				fix_fetch[SI_MAX_ATTRIBS];
>   	uint8_t				format_size[SI_MAX_ATTRIBS];
>   	uint8_t				vertex_buffer_index[SI_MAX_ATTRIBS];
>   
>   	uint8_t				count;
>   	bool				uses_instance_divisors;
>   
>   	uint16_t			first_vb_use_mask;
>   	/* Vertex buffer descriptor list size aligned for optimal prefetch. */
>   	uint16_t			desc_list_byte_size;
> +	uint16_t			instance_divisor_is_one; /* bitmask of inputs */
> +	uint16_t			instance_divisor_is_fetched;  /* bitmask of inputs */
>   };
>   
>   union si_state {
>   	struct {
>   		struct si_state_blend		*blend;
>   		struct si_state_rasterizer	*rasterizer;
>   		struct si_state_dsa		*dsa;
>   		struct si_pm4_state		*poly_offset;
>   		struct si_pm4_state		*ls;
>   		struct si_pm4_state		*hs;
> @@ -175,20 +177,21 @@ enum {
>   	SI_GS_RING_ESGS,
>   
>   	SI_RING_GSVS,
>   
>   	SI_VS_STREAMOUT_BUF0,
>   	SI_VS_STREAMOUT_BUF1,
>   	SI_VS_STREAMOUT_BUF2,
>   	SI_VS_STREAMOUT_BUF3,
>   
>   	SI_HS_CONST_DEFAULT_TESS_LEVELS,
> +	SI_VS_CONST_INSTANCE_DIVISORS,
>   	SI_VS_CONST_CLIP_PLANES,
>   	SI_PS_CONST_POLY_STIPPLE,
>   	SI_PS_CONST_SAMPLE_POSITIONS,
>   
>   	SI_NUM_RW_BUFFERS,
>   };
>   
>   /* Indices into sctx->descriptors, laid out so that gfx and compute pipelines
>    * are contiguous:
>    *
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 4eb3b75..63cc746 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -1180,24 +1180,27 @@ static unsigned si_get_alpha_test_func(struct si_context *sctx)
>   }
>   
>   static void si_shader_selector_key_vs(struct si_context *sctx,
>   				      struct si_shader_selector *vs,
>   				      struct si_shader_key *key,
>   				      struct si_vs_prolog_bits *prolog_key)
>   {
>   	if (!sctx->vertex_elements)
>   		return;
>   
> +	prolog_key->instance_divisor_is_one =
> +		sctx->vertex_elements->instance_divisor_is_one;
> +	prolog_key->instance_divisor_is_fetched =
> +		sctx->vertex_elements->instance_divisor_is_fetched;
> +
>   	unsigned count = MIN2(vs->info.num_inputs,
>   			      sctx->vertex_elements->count);
> -	memcpy(prolog_key->instance_divisors,
> -	       sctx->vertex_elements->instance_divisors, count * 4);
>   	memcpy(key->mono.vs_fix_fetch, sctx->vertex_elements->fix_fetch, count);
>   }
>   
>   static void si_shader_selector_key_hw_vs(struct si_context *sctx,
>   					 struct si_shader_selector *vs,
>   					 struct si_shader_key *key)
>   {
>   	struct si_shader_selector *ps = sctx->ps_shader.cso;
>   
>   	key->opt.clip_disable =
> 


-- 
Lerne, wie die Welt wirklich ist,
Aber vergiss niemals, wie sie sein sollte.


More information about the mesa-dev mailing list