[Mesa-dev] [PATCH 2/4] radeonsi: implement legacy GL_DOUBLE vertex formats

Sat Feb 11 16:30:25 UTC 2017

From: Marek Olšák <marek.olsak at amd.com>

Implement the legacy GL_DOUBLE vertex formats in radeonsi so that we can
disable u_vbuf for GL core profiles.
---
 src/gallium/drivers/radeonsi/si_shader.c | 78 ++++++++++++++++++++++++++------
 src/gallium/drivers/radeonsi/si_shader.h |  4 ++
 src/gallium/drivers/radeonsi/si_state.c  | 56 ++++++++++++++++++++---
 3 files changed, 117 insertions(+), 21 deletions(-)

diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 7b89014..1c84e8d 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -320,70 +320,99 @@ static LLVMValueRef get_instance_index_for_fetch(
 
 	/* The division must be done before START_INSTANCE is added. */
 	if (divisor > 1)
 		result = LLVMBuildUDiv(gallivm->builder, result,
 				lp_build_const_int32(gallivm, divisor), "");
 
 	return LLVMBuildAdd(gallivm->builder, result,
 			    LLVMGetParam(radeon_bld->main_fn, param_start_instance), "");
 }
 
+/* Reinterpret the fetched <4 x float> "vec4" as <2 x double>, extract element
+ * "double_index" (0 or 1), and return it truncated to float (ctx->f32). */
+static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
+					    LLVMValueRef vec4,
+					    unsigned double_index)
+{
+	LLVMBuilderRef builder = ctx->gallivm.builder;
+	LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->gallivm.context);
+	LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4, /* same bits, new type */
+					      LLVMVectorType(f64, 2), "");
+	LLVMValueRef index = LLVMConstInt(ctx->i32, double_index, 0); /* 0 = no sign extension */
+	LLVMValueRef value = LLVMBuildExtractElement(builder, dvec2, index, "");
+	return LLVMBuildFPTrunc(builder, value, ctx->f32, ""); /* f64 -> f32 */
+}
+
 static void declare_input_vs(
 	struct si_shader_context *ctx,
 	unsigned input_index,
 	const struct tgsi_full_declaration *decl,
 	LLVMValueRef out[4])
 {
 	struct lp_build_context *base = &ctx->bld_base.base;
 	struct gallivm_state *gallivm = base->gallivm;
 
 	unsigned chan;
 	unsigned fix_fetch;
+	unsigned num_fetches;
+	unsigned fetch_stride;
 
 	LLVMValueRef t_list_ptr;
 	LLVMValueRef t_offset;
 	LLVMValueRef t_list;
-	LLVMValueRef attribute_offset;
-	LLVMValueRef buffer_index;
+	LLVMValueRef vertex_index;
 	LLVMValueRef args[3];
-	LLVMValueRef input;
+	LLVMValueRef input[3];
 
 	/* Load the T list */
 	t_list_ptr = LLVMGetParam(ctx->main_fn, SI_PARAM_VERTEX_BUFFERS);
 
 	t_offset = lp_build_const_int32(gallivm, input_index);
 
 	t_list = ac_build_indexed_load_const(&ctx->ac, t_list_ptr, t_offset);
 
-	/* Build the attribute offset */
-	attribute_offset = lp_build_const_int32(gallivm, 0);
-
-	buffer_index = LLVMGetParam(ctx->main_fn,
+	vertex_index = LLVMGetParam(ctx->main_fn,
 				    ctx->param_vertex_index0 +
 				    input_index);
 
+	fix_fetch = (ctx->shader->key.mono.vs.fix_fetch >> (4 * input_index)) & 0xf;
+
+	/* Do multiple loads for double formats. */
+	if (fix_fetch == SI_FIX_FETCH_RGB_64_FLOAT) {
+		num_fetches = 3; /* 3 2-dword loads */
+		fetch_stride = 8;
+	} else if (fix_fetch == SI_FIX_FETCH_RGBA_64_FLOAT) {
+		num_fetches = 2; /* 2 4-dword loads */
+		fetch_stride = 16;
+	} else {
+		num_fetches = 1;
+		fetch_stride = 0;
+	}
+
 	args[0] = t_list;
-	args[1] = attribute_offset;
-	args[2] = buffer_index;
-	input = lp_build_intrinsic(gallivm->builder,
-		"llvm.SI.vs.load.input", ctx->v4f32, args, 3,
-		LP_FUNC_ATTR_READNONE);
+	args[2] = vertex_index;
+
+	for (unsigned i = 0; i < num_fetches; i++) {
+		args[1] = LLVMConstInt(ctx->i32, fetch_stride * i, 0);
+
+		input[i] = lp_build_intrinsic(gallivm->builder,
+			"llvm.SI.vs.load.input", ctx->v4f32, args, 3,
+			LP_FUNC_ATTR_READNONE);
+	}
 
 	/* Break up the vec4 into individual components */
 	for (chan = 0; chan < 4; chan++) {
 		LLVMValueRef llvm_chan = lp_build_const_int32(gallivm, chan);
 		out[chan] = LLVMBuildExtractElement(gallivm->builder,
-						    input, llvm_chan, "");
+						    input[0], llvm_chan, "");
 	}
 
-	fix_fetch = (ctx->shader->key.mono.vs.fix_fetch >> (4 * input_index)) & 0xf;
-
 	switch (fix_fetch) {
 	case SI_FIX_FETCH_A2_SNORM:
 	case SI_FIX_FETCH_A2_SSCALED:
 	case SI_FIX_FETCH_A2_SINT: {
 		/* The hardware returns an unsigned value; convert it to a
 		 * signed one.
 		 */
 		LLVMValueRef tmp = out[3];
 		LLVMValueRef c30 = LLVMConstInt(ctx->i32, 30, 0);
 
@@ -465,20 +494,39 @@ static void declare_input_vs(
 		}
 		break;
 	case SI_FIX_FETCH_RGBA_32_SSCALED:
 		for (chan = 0; chan < 4; chan++) {
 			out[chan] = LLVMBuildBitCast(gallivm->builder, out[chan],
 						     ctx->i32, "");
 			out[chan] = LLVMBuildSIToFP(gallivm->builder,
 						    out[chan], ctx->f32, "");
 		}
 		break;
+	case SI_FIX_FETCH_RG_64_FLOAT:
+		for (chan = 0; chan < 2; chan++)
+			out[chan] = extract_double_to_float(ctx, input[0], chan);
+
+		out[2] = LLVMConstReal(ctx->f32, 0);
+		out[3] = LLVMConstReal(ctx->f32, 1);
+		break;
+	case SI_FIX_FETCH_RGB_64_FLOAT:
+		for (chan = 0; chan < 3; chan++)
+			out[chan] = extract_double_to_float(ctx, input[chan], 0);
+
+		out[3] = LLVMConstReal(ctx->f32, 1);
+		break;
+	case SI_FIX_FETCH_RGBA_64_FLOAT:
+		for (chan = 0; chan < 4; chan++) {
+			out[chan] = extract_double_to_float(ctx, input[chan / 2],
+							    chan % 2);
+		}
+		break;
 	}
 }
 
 static LLVMValueRef get_primitive_id(struct lp_build_tgsi_context *bld_base,
 				     unsigned swizzle)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 
 	if (swizzle > 0)
 		return bld_base->uint_bld.zero;
diff --git a/src/gallium/drivers/radeonsi/si_shader.h b/src/gallium/drivers/radeonsi/si_shader.h
index 5464d67..6398b39 100644
--- a/src/gallium/drivers/radeonsi/si_shader.h
+++ b/src/gallium/drivers/radeonsi/si_shader.h
@@ -240,20 +240,24 @@ enum {
 	SI_FIX_FETCH_A2_SSCALED,
 	SI_FIX_FETCH_A2_SINT,
 	SI_FIX_FETCH_RGBA_32_UNORM,
 	SI_FIX_FETCH_RGBX_32_UNORM,
 	SI_FIX_FETCH_RGBA_32_SNORM,
 	SI_FIX_FETCH_RGBX_32_SNORM,
 	SI_FIX_FETCH_RGBA_32_USCALED,
 	SI_FIX_FETCH_RGBA_32_SSCALED,
 	SI_FIX_FETCH_RGBA_32_FIXED,
 	SI_FIX_FETCH_RGBX_32_FIXED,
+	SI_FIX_FETCH_RG_64_FLOAT,
+	SI_FIX_FETCH_RGB_64_FLOAT,
+	SI_FIX_FETCH_RGBA_64_FLOAT,
+	SI_FIX_FETCH_RESERVED_15, /* maximum */
 };
 
 struct si_shader;
 
 /* State of the context creating the shader object. */
 struct si_compiler_ctx_state {
 	/* Should only be used by si_init_shader_selector_async and
 	 * si_build_shader_variant if thread_index == -1 (non-threaded). */
 	LLVMTargetMachineRef		tm;
 
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index 1e0729c..107bc06 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -1755,20 +1755,33 @@ static uint32_t si_translate_buffer_dataformat(struct pipe_screen *screen,
 		case 1:
 			return V_008F0C_BUF_DATA_FORMAT_32;
 		case 2:
 			return V_008F0C_BUF_DATA_FORMAT_32_32;
 		case 3:
 			return V_008F0C_BUF_DATA_FORMAT_32_32_32;
 		case 4:
 			return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
 		}
 		break;
+	case 64:
+		/* Legacy double formats. */
+		switch (desc->nr_channels) {
+		case 1: /* 1 load */
+			return V_008F0C_BUF_DATA_FORMAT_32_32;
+		case 2: /* 1 load */
+			return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+		case 3: /* 3 loads */
+			return V_008F0C_BUF_DATA_FORMAT_32_32;
+		case 4: /* 2 loads */
+			return V_008F0C_BUF_DATA_FORMAT_32_32_32_32;
+		}
+		break;
 	}
 
 	return V_008F0C_BUF_DATA_FORMAT_INVALID;
 }
 
 static uint32_t si_translate_buffer_numformat(struct pipe_screen *screen,
 					      const struct util_format_description *desc,
 					      int first_non_void)
 {
 	if (desc->format == PIPE_FORMAT_R11G11B10_FLOAT)
@@ -3352,43 +3365,39 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 
 	v->count = count;
 	v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT);
 
 	for (i = 0; i < count; ++i) {
 		const struct util_format_description *desc;
 		const struct util_format_channel_description *channel;
 		unsigned data_format, num_format;
 		int first_non_void;
 		unsigned vbo_index = elements[i].vertex_buffer_index;
+		unsigned char swizzle[4];
 
 		if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
 			FREE(v);
 			return NULL;
 		}
 
 		if (!used[vbo_index]) {
 			v->first_vb_use_mask |= 1 << i;
 			used[vbo_index] = true;
 		}
 
 		desc = util_format_description(elements[i].src_format);
 		first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
 		data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
 		num_format = si_translate_buffer_numformat(ctx->screen, desc, first_non_void);
 		channel = first_non_void >= 0 ? &desc->channel[first_non_void] : NULL;
+		memcpy(swizzle, desc->swizzle, sizeof(swizzle));
 
-		v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(desc->swizzle[0])) |
-				   S_008F0C_DST_SEL_Y(si_map_swizzle(desc->swizzle[1])) |
-				   S_008F0C_DST_SEL_Z(si_map_swizzle(desc->swizzle[2])) |
-				   S_008F0C_DST_SEL_W(si_map_swizzle(desc->swizzle[3])) |
-				   S_008F0C_NUM_FORMAT(num_format) |
-				   S_008F0C_DATA_FORMAT(data_format);
 		v->format_size[i] = desc->block.bits / 8;
 
 		/* The hardware always treats the 2-bit alpha channel as
 		 * unsigned, so a shader workaround is needed.
 		 */
 		if (data_format == V_008F0C_BUF_DATA_FORMAT_2_10_10_10) {
 			if (num_format == V_008F0C_BUF_NUM_FORMAT_SNORM) {
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_A2_SNORM << (4 * i);
 			} else if (num_format == V_008F0C_BUF_NUM_FORMAT_SSCALED) {
 				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_A2_SSCALED << (4 * i);
@@ -3414,22 +3423,57 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
 			} else if (channel->type == UTIL_FORMAT_TYPE_UNSIGNED) {
 				if (channel->normalized) {
 					if (desc->swizzle[3] == PIPE_SWIZZLE_1)
 						v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBX_32_UNORM << (4 * i);
 					else
 						v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBA_32_UNORM << (4 * i);
 				} else {
 					v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBA_32_USCALED << (4 * i);
 				}
 			}
+		} else if (channel && channel->size == 64 &&
+			   channel->type == UTIL_FORMAT_TYPE_FLOAT) {
+			switch (desc->nr_channels) {
+			case 1:
+			case 2:
+				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RG_64_FLOAT << (4 * i);
+				swizzle[0] = PIPE_SWIZZLE_X;
+				swizzle[1] = PIPE_SWIZZLE_Y;
+				swizzle[2] = desc->nr_channels == 2 ? PIPE_SWIZZLE_Z : PIPE_SWIZZLE_0;
+				swizzle[3] = desc->nr_channels == 2 ? PIPE_SWIZZLE_W : PIPE_SWIZZLE_0;
+				break;
+			case 3:
+				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGB_64_FLOAT << (4 * i);
+				swizzle[0] = PIPE_SWIZZLE_X; /* 3 loads */
+				swizzle[1] = PIPE_SWIZZLE_Y;
+				swizzle[2] = PIPE_SWIZZLE_0;
+				swizzle[3] = PIPE_SWIZZLE_0;
+				break;
+			case 4:
+				v->fix_fetch |= (uint64_t)SI_FIX_FETCH_RGBA_64_FLOAT << (4 * i);
+				swizzle[0] = PIPE_SWIZZLE_X; /* 2 loads */
+				swizzle[1] = PIPE_SWIZZLE_Y;
+				swizzle[2] = PIPE_SWIZZLE_Z;
+				swizzle[3] = PIPE_SWIZZLE_W;
+				break;
+			default:
+				assert(0);
+			}
 		}
 
+		v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
+				   S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
+				   S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
+				   S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
+				   S_008F0C_NUM_FORMAT(num_format) |
+				   S_008F0C_DATA_FORMAT(data_format);
+
 		/* We work around the fact that 8_8_8 and 16_16_16 data formats
 		 * do not exist by using the corresponding 4-component formats.
 		 * This requires a fixup of the descriptor for bounds checks.
 		 */
 		if (desc->block.bits == 3 * 8 ||
 		    desc->block.bits == 3 * 16) {
 			v->fix_size3 |= (desc->block.bits / 24) << (2 * i);
 		}
 	}
 	memcpy(v->elements, elements, sizeof(struct pipe_vertex_element) * count);
-- 
2.7.4