[Mesa-dev] [PATCH] radv: move to using nir clip/cull merge pass.

Wed Mar 29 05:14:51 UTC 2017

From: Dave Airlie <airlied at redhat.com>

Moving to the NIR clip/cull merge pass
(nir_lower_clip_cull_distance_arrays) before tessellation makes some
bits of tessellation a bit cleaner. It also cleans up a bit of the
LLVM generator code.

Signed-off-by: Dave Airlie <airlied at redhat.com>
---
 src/amd/common/ac_nir_to_llvm.c | 144 ++++++++++------------------------------
 src/amd/vulkan/radv_pipeline.c  |   1 +
 2 files changed, 36 insertions(+), 109 deletions(-)

diff --git a/src/amd/common/ac_nir_to_llvm.c b/src/amd/common/ac_nir_to_llvm.c
index f164d8f..78602fd 100644
--- a/src/amd/common/ac_nir_to_llvm.c
+++ b/src/amd/common/ac_nir_to_llvm.c
@@ -144,8 +144,6 @@ struct nir_to_llvm_context {
 	int num_locals;
 	LLVMValueRef *locals;
 	bool has_ddxy;
-	uint8_t num_input_clips;
-	uint8_t num_input_culls;
 	uint8_t num_output_clips;
 	uint8_t num_output_culls;
 
@@ -170,12 +168,9 @@ static unsigned shader_io_get_unique_index(gl_varying_slot slot)
 		return 0;
 	if (slot == VARYING_SLOT_PSIZ)
 		return 1;
-	if (slot == VARYING_SLOT_CLIP_DIST0 ||
-	    slot == VARYING_SLOT_CULL_DIST0)
+	if (slot == VARYING_SLOT_CLIP_DIST0)
 		return 2;
-	if (slot == VARYING_SLOT_CLIP_DIST1 ||
-	    slot == VARYING_SLOT_CULL_DIST1)
-		return 3;
+	/* 3 is reserved for clip dist as well */
 	if (slot >= VARYING_SLOT_VAR0 && slot <= VARYING_SLOT_VAR31)
 		return 4 + (slot - VARYING_SLOT_VAR0);
 	unreachable("illegal slot in get unique index\n");
@@ -2195,7 +2190,6 @@ load_gs_input(struct nir_to_llvm_context *ctx,
 	unsigned param, vtx_offset_param;
 	LLVMValueRef value[4], result;
 	unsigned vertex_index;
-	unsigned cull_offset = 0;
 	radv_get_deref_offset(ctx, &instr->variables[0]->deref,
 			      false, &vertex_index,
 			      &const_index, &indir_index);
@@ -2205,13 +2199,11 @@ load_gs_input(struct nir_to_llvm_context *ctx,
 				  LLVMConstInt(ctx->i32, 4, false), "");
 
 	param = shader_io_get_unique_index(instr->variables[0]->var->data.location);
-	if (instr->variables[0]->var->data.location == VARYING_SLOT_CULL_DIST0)
-		cull_offset += ctx->num_input_clips;
 	for (unsigned i = 0; i < instr->num_components; i++) {
 
 		args[0] = ctx->esgs_ring;
 		args[1] = vtx_offset;
-		args[2] = LLVMConstInt(ctx->i32, (param * 4 + i + const_index + cull_offset) * 256, false);
+		args[2] = LLVMConstInt(ctx->i32, (param * 4 + i + const_index) * 256, false);
 		args[3] = ctx->i32zero;
 		args[4] = ctx->i32one; /* OFFEN */
 		args[5] = ctx->i32zero; /* IDXEN */
@@ -2366,8 +2358,7 @@ visit_store_var(struct nir_to_llvm_context *ctx,
 
 			value = llvm_extract_elem(ctx, src, chan);
 
-			if (instr->variables[0]->var->data.location == VARYING_SLOT_CLIP_DIST0 ||
-			    instr->variables[0]->var->data.location == VARYING_SLOT_CULL_DIST0)
+			if (instr->variables[0]->var->data.compact)
 				stride = 1;
 			if (indir_index) {
 				unsigned count = glsl_count_attribute_slots(
@@ -3143,7 +3134,7 @@ visit_emit_vertex(struct nir_to_llvm_context *ctx,
 	LLVMValueRef gs_next_vertex;
 	LLVMValueRef can_emit, kill;
 	int idx;
-	int clip_cull_slot = -1;
+
 	assert(instr->const_index[0] == 0);
 	/* Write vertex attribute values to GSVS ring */
 	gs_next_vertex = LLVMBuildLoad(ctx->builder,
@@ -3175,27 +3166,11 @@ visit_emit_vertex(struct nir_to_llvm_context *ctx,
 		if (!(ctx->output_mask & (1ull << i)))
 			continue;
 
-		if (i == VARYING_SLOT_CLIP_DIST1 ||
-		    i == VARYING_SLOT_CULL_DIST1)
-			continue;
-
-		if (i == VARYING_SLOT_CLIP_DIST0 ||
-		    i == VARYING_SLOT_CULL_DIST0) {
+		if (i == VARYING_SLOT_CLIP_DIST0) {
 			/* pack clip and cull into a single set of slots */
-			if (clip_cull_slot == -1) {
-				clip_cull_slot = idx;
-				if (ctx->num_output_clips + ctx->num_output_culls > 4)
-					slot_inc = 2;
-			} else {
-				slot = clip_cull_slot;
-				slot_inc = 0;
-			}
-			if (i == VARYING_SLOT_CLIP_DIST0)
-				length = ctx->num_output_clips;
-			if (i == VARYING_SLOT_CULL_DIST0) {
-				start = ctx->num_output_clips;
-				length = ctx->num_output_culls;
-			}
+			length = ctx->num_output_clips + ctx->num_output_culls;
+			if (length > 4)
+				slot_inc = 2;
 		}
 		for (unsigned j = 0; j < length; j++) {
 			LLVMValueRef out_val = LLVMBuildLoad(ctx->builder,
@@ -4083,22 +4058,6 @@ handle_vs_input_decl(struct nir_to_llvm_context *ctx,
 	}
 }
 
-static void
-handle_gs_input_decl(struct nir_to_llvm_context *ctx,
-		     struct nir_variable *variable)
-{
-	int idx = variable->data.location;
-
-	if (idx == VARYING_SLOT_CLIP_DIST0 ||
-	    idx == VARYING_SLOT_CULL_DIST0) {
-		int length = glsl_get_length(glsl_get_array_element(variable->type));
-		if (idx == VARYING_SLOT_CLIP_DIST0)
-			ctx->num_input_clips = length;
-		else
-			ctx->num_input_culls = length;
-	}
-}
-
 static void interp_fs_input(struct nir_to_llvm_context *ctx,
 			    unsigned attr,
 			    LLVMValueRef interp_param,
@@ -4191,9 +4150,6 @@ handle_shader_input_decl(struct nir_to_llvm_context *ctx,
 	case MESA_SHADER_FRAGMENT:
 		handle_fs_input_decl(ctx, variable);
 		break;
-	case MESA_SHADER_GEOMETRY:
-		handle_gs_input_decl(ctx, variable);
-		break;
 	default:
 		break;
 	}
@@ -4276,33 +4232,33 @@ static LLVMValueRef si_build_alloca_undef(struct nir_to_llvm_context *ctx,
 
 static void
 handle_shader_output_decl(struct nir_to_llvm_context *ctx,
+			  struct nir_shader *nir,
 			  struct nir_variable *variable)
 {
 	int idx = variable->data.location + variable->data.index;
 	unsigned attrib_count = glsl_count_attribute_slots(variable->type, false);
-
+	unsigned mask_attribs;
 	variable->data.driver_location = idx * 4;
 
 	if (ctx->stage == MESA_SHADER_VERTEX ||
 	    ctx->stage == MESA_SHADER_GEOMETRY) {
-		if (idx == VARYING_SLOT_CLIP_DIST0 ||
-		    idx == VARYING_SLOT_CULL_DIST0) {
+		if (idx == VARYING_SLOT_CLIP_DIST0) {
 			int length = glsl_get_length(variable->type);
-			if (idx == VARYING_SLOT_CLIP_DIST0) {
-				if (ctx->stage == MESA_SHADER_VERTEX)
-					ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << length) - 1;
-				ctx->num_output_clips = length;
-			} else if (idx == VARYING_SLOT_CULL_DIST0) {
-				if (ctx->stage == MESA_SHADER_VERTEX)
-					ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << length) - 1;
-				ctx->num_output_culls = length;
+			if (ctx->stage == MESA_SHADER_VERTEX) {
+				ctx->shader_info->vs.outinfo.clip_dist_mask = (1 << nir->info->clip_distance_array_size) - 1;
+				ctx->shader_info->vs.outinfo.cull_dist_mask = (1 << nir->info->cull_distance_array_size) - 1;
 			}
+			ctx->num_output_clips = nir->info->clip_distance_array_size;
+			ctx->num_output_culls = nir->info->cull_distance_array_size;
+
 			if (length > 4)
 				attrib_count = 2;
 			else
 				attrib_count = 1;
 		}
-	}
+		mask_attribs = ((1ull << 1) - 1) << idx;
+	} else
+		mask_attribs = ((1ull << attrib_count) - 1) << idx;
 
 	for (unsigned i = 0; i < attrib_count; ++i) {
 		for (unsigned chan = 0; chan < 4; chan++) {
@@ -4310,7 +4266,7 @@ handle_shader_output_decl(struct nir_to_llvm_context *ctx,
 		                       si_build_alloca_undef(ctx, ctx->f32, "");
 		}
 	}
-	ctx->output_mask |= ((1ull << attrib_count) - 1) << idx;
+	ctx->output_mask |= mask_attribs;
 }
 
 static void
@@ -4519,14 +4475,10 @@ handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
 	struct ac_export_args args, pos_args[4] = {};
 	LLVMValueRef psize_value = NULL, layer_value = NULL, viewport_index_value = NULL;
 	int i;
-	const uint64_t clip_mask = ctx->output_mask & ((1ull << VARYING_SLOT_CLIP_DIST0) |
-						       (1ull << VARYING_SLOT_CLIP_DIST1) |
-						       (1ull << VARYING_SLOT_CULL_DIST0) |
-						       (1ull << VARYING_SLOT_CULL_DIST1));
 
 	outinfo->prim_id_output = 0xffffffff;
 	outinfo->layer_output = 0xffffffff;
-	if (clip_mask) {
+	if (ctx->output_mask & (1ull << VARYING_SLOT_CLIP_DIST0)) {
 		LLVMValueRef slots[8];
 		unsigned j;
 
@@ -4534,13 +4486,9 @@ handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
 			outinfo->cull_dist_mask <<= ctx->num_output_clips;
 
 		i = VARYING_SLOT_CLIP_DIST0;
-		for (j = 0; j < ctx->num_output_clips; j++)
+		for (j = 0; j < ctx->num_output_clips + ctx->num_output_culls; j++)
 			slots[j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
 							       ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
-		i = VARYING_SLOT_CULL_DIST0;
-		for (j = 0; j < ctx->num_output_culls; j++)
-			slots[ctx->num_output_clips + j] = to_float(ctx, LLVMBuildLoad(ctx->builder,
-									   ctx->outputs[radeon_llvm_reg_index_soa(i, j)], ""));
 
 		for (i = ctx->num_output_clips + ctx->num_output_culls; i < 8; i++)
 			slots[i] = LLVMGetUndef(ctx->f32);
@@ -4570,10 +4518,7 @@ handle_vs_outputs_post(struct nir_to_llvm_context *ctx,
 
 		if (i == VARYING_SLOT_POS) {
 			target = V_008DFC_SQ_EXP_POS;
-		} else if (i == VARYING_SLOT_CLIP_DIST0 ||
-			   i == VARYING_SLOT_CLIP_DIST1 ||
-			   i == VARYING_SLOT_CULL_DIST0 ||
-			   i == VARYING_SLOT_CULL_DIST1) {
+		} else if (i == VARYING_SLOT_CLIP_DIST0) {
 			continue;
 		} else if (i == VARYING_SLOT_PSIZ) {
 			outinfo->writes_pointsize = true;
@@ -4679,12 +4624,9 @@ handle_es_outputs_post(struct nir_to_llvm_context *ctx,
 		if (!(ctx->output_mask & (1ull << i)))
 			continue;
 
-		if (i == VARYING_SLOT_CLIP_DIST0) {
-			length = ctx->num_output_clips;
-		} else if (i == VARYING_SLOT_CULL_DIST0) {
-			start = ctx->num_output_clips;
-			length = ctx->num_output_culls;
-		}
+		if (i == VARYING_SLOT_CLIP_DIST0)
+			length = ctx->num_output_clips + ctx->num_output_culls;
+
 		param_index = shader_io_get_unique_index(i);
 
 		if (param_index > max_output_written)
@@ -4980,7 +4922,7 @@ LLVMModuleRef ac_translate_nir_to_llvm(LLVMTargetMachineRef tm,
 		handle_fs_inputs_pre(&ctx, nir);
 
 	nir_foreach_variable(variable, &nir->outputs)
-		handle_shader_output_decl(&ctx, variable);
+		handle_shader_output_decl(&ctx, nir, variable);
 
 	ctx.defs = _mesa_hash_table_create(NULL, _mesa_hash_pointer,
 	                                   _mesa_key_pointer_equal);
@@ -5185,7 +5127,7 @@ ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
 	args[8] = ctx->i32zero; /* TFE */
 
 	int idx = 0;
-	int clip_cull_slot = -1;
+
 	for (unsigned i = 0; i < RADEON_LLVM_MAX_OUTPUTS; ++i) {
 		int length = 4;
 		int start = 0;
@@ -5194,27 +5136,11 @@ ac_gs_copy_shader_emit(struct nir_to_llvm_context *ctx)
 		if (!(ctx->output_mask & (1ull << i)))
 			continue;
 
-		if (i == VARYING_SLOT_CLIP_DIST1 ||
-		    i == VARYING_SLOT_CULL_DIST1)
-			continue;
-
-		if (i == VARYING_SLOT_CLIP_DIST0 ||
-		    i == VARYING_SLOT_CULL_DIST0) {
+		if (i == VARYING_SLOT_CLIP_DIST0) {
 			/* unpack clip and cull from a single set of slots */
-			if (clip_cull_slot == -1) {
-				clip_cull_slot = idx;
-				if (ctx->num_output_clips + ctx->num_output_culls > 4)
-					slot_inc = 2;
-			} else {
-				slot = clip_cull_slot;
-				slot_inc = 0;
-			}
-			if (i == VARYING_SLOT_CLIP_DIST0)
-				length = ctx->num_output_clips;
-			if (i == VARYING_SLOT_CULL_DIST0) {
-				start = ctx->num_output_clips;
-				length = ctx->num_output_culls;
-			}
+			length = ctx->num_output_clips + ctx->num_output_culls;
+			if (length > 4)
+				slot_inc = 2;
 		}
 
 		for (unsigned j = 0; j < length; j++) {
@@ -5268,7 +5194,7 @@ void ac_create_gs_copy_shader(LLVMTargetMachineRef tm,
 	ac_setup_rings(&ctx);
 
 	nir_foreach_variable(variable, &geom_shader->outputs)
-		handle_shader_output_decl(&ctx, variable);
+		handle_shader_output_decl(&ctx, geom_shader, variable);
 
 	ac_gs_copy_shader_emit(&ctx);
 
diff --git a/src/amd/vulkan/radv_pipeline.c b/src/amd/vulkan/radv_pipeline.c
index 07020e8..df46276 100644
--- a/src/amd/vulkan/radv_pipeline.c
+++ b/src/amd/vulkan/radv_pipeline.c
@@ -246,6 +246,7 @@ radv_shader_compile_to_nir(struct radv_device *device,
 		 */
 		NIR_PASS_V(nir, nir_lower_constant_initializers, ~0);
 		NIR_PASS_V(nir, nir_lower_system_values);
+		NIR_PASS_V(nir, nir_lower_clip_cull_distance_arrays);
 	}
 
 	/* Vulkan uses the separate-shader linking model */
-- 
2.9.3