[Mesa-dev] [PATCH 09/15] radeonsi: fix GPU hangs with bindless textures and LLVM 7.0

Marek Olšák maraeo at gmail.com
Wed Aug 29 20:13:05 UTC 2018


From: Marek Olšák <marek.olsak at amd.com>

---
 src/amd/common/ac_llvm_build.c                | 52 +++++++++++++++++--
 src/amd/common/ac_llvm_build.h                |  4 ++
 .../drivers/radeonsi/si_shader_internal.h     |  3 +-
 src/gallium/drivers/radeonsi/si_shader_nir.c  | 12 ++++-
 .../drivers/radeonsi/si_shader_tgsi_mem.c     | 23 ++++++--
 5 files changed, 84 insertions(+), 10 deletions(-)

diff --git a/src/amd/common/ac_llvm_build.c b/src/amd/common/ac_llvm_build.c
index 1c8d944db74..1f5112e9929 100644
--- a/src/amd/common/ac_llvm_build.c
+++ b/src/amd/common/ac_llvm_build.c
@@ -828,70 +828,112 @@ ac_build_gep0(struct ac_llvm_context *ctx,
 	      LLVMValueRef base_ptr,
 	      LLVMValueRef index)
 {
 	LLVMValueRef indices[2] = {
 		ctx->i32_0,
 		index,
 	};
 	return LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
 }
 
+LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+				  LLVMValueRef index)
+{
+	return LLVMBuildPointerCast(ctx->builder,
+				    ac_build_gep0(ctx, ptr, index),
+				    LLVMTypeOf(ptr), "");
+}
+
 void
 ac_build_indexed_store(struct ac_llvm_context *ctx,
 		       LLVMValueRef base_ptr, LLVMValueRef index,
 		       LLVMValueRef value)
 {
 	LLVMBuildStore(ctx->builder, value,
 		       ac_build_gep0(ctx, base_ptr, index));
 }
 
 /**
  * Build an LLVM bytecode indexed load using LLVMBuildGEP + LLVMBuildLoad.
  * It's equivalent to doing a load from &base_ptr[index].
  *
  * \param base_ptr  Where the array starts.
  * \param index     The element index into the array.
  * \param uniform   Whether the base_ptr and index can be assumed to be
  *                  dynamically uniform (i.e. load to an SGPR)
  * \param invariant Whether the load is invariant (no other opcodes affect it)
+ * \param no_unsigned_wraparound
+ *    For all possible re-associations and re-distributions of an expression
+ *    "base_ptr + index * elemsize" into "addr + offset" (excluding GEPs
+ *    without inbounds in base_ptr), this parameter is true if "addr + offset"
+ *    does not result in an unsigned integer wraparound. This is used for
+ *    optimal code generation of 32-bit pointer arithmetic.
+ *
+ *    For example, a 32-bit immediate offset that causes a 32-bit unsigned
+ *    integer wraparound can't be an imm offset in s_load_dword, because
+ *    the instruction performs "addr + offset" in 64 bits.
+ *
+ *    Expected usage for bindless textures by chaining GEPs:
+ *      // possible unsigned wraparound, don't use InBounds:
+ *      ptr1 = LLVMBuildGEP(base_ptr, index);
+ *      image = load(ptr1); // becomes "s_load ptr1, 0"
+ *
+ *      ptr2 = LLVMBuildInBoundsGEP(ptr1, 32 / elemsize);
+ *      sampler = load(ptr2); // becomes "s_load ptr1, 32" thanks to InBounds
  */
 static LLVMValueRef
 ac_build_load_custom(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
-		     LLVMValueRef index, bool uniform, bool invariant)
+		     LLVMValueRef index, bool uniform, bool invariant,
+		     bool no_unsigned_wraparound)
 {
 	LLVMValueRef pointer, result;
+	LLVMValueRef indices[2] = {ctx->i32_0, index};
+
+	if (no_unsigned_wraparound &&
+	    LLVMGetPointerAddressSpace(LLVMTypeOf(base_ptr)) == AC_CONST_32BIT_ADDR_SPACE)
+		pointer = LLVMBuildInBoundsGEP(ctx->builder, base_ptr, indices, 2, "");
+	else
+		pointer = LLVMBuildGEP(ctx->builder, base_ptr, indices, 2, "");
 
-	pointer = ac_build_gep0(ctx, base_ptr, index);
 	if (uniform)
 		LLVMSetMetadata(pointer, ctx->uniform_md_kind, ctx->empty_md);
 	result = LLVMBuildLoad(ctx->builder, pointer, "");
 	if (invariant)
 		LLVMSetMetadata(result, ctx->invariant_load_md_kind, ctx->empty_md);
 	return result;
 }
 
 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
 			   LLVMValueRef index)
 {
-	return ac_build_load_custom(ctx, base_ptr, index, false, false);
+	return ac_build_load_custom(ctx, base_ptr, index, false, false, false);
 }
 
 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
 				     LLVMValueRef base_ptr, LLVMValueRef index)
 {
-	return ac_build_load_custom(ctx, base_ptr, index, false, true);
+	return ac_build_load_custom(ctx, base_ptr, index, false, true, false);
 }
 
+/* This assumes that there is no unsigned integer wraparound during the address
+ * computation, excluding all GEPs within base_ptr. */
 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
 				   LLVMValueRef base_ptr, LLVMValueRef index)
 {
-	return ac_build_load_custom(ctx, base_ptr, index, true, true);
+	return ac_build_load_custom(ctx, base_ptr, index, true, true, true);
+}
+
+/* See ac_build_load_custom() documentation. */
+LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
+				   LLVMValueRef base_ptr, LLVMValueRef index)
+{
+	return ac_build_load_custom(ctx, base_ptr, index, true, true, false);
 }
 
 /* TBUFFER_STORE_FORMAT_{X,XY,XYZ,XYZW} <- the suffix is selected by num_channels=1..4.
  * The type of vdata must be one of i32 (num_channels=1), v2i32 (num_channels=2),
  * or v4i32 (num_channels=3,4).
  */
 void
 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 			    LLVMValueRef rsrc,
 			    LLVMValueRef vdata,
diff --git a/src/amd/common/ac_llvm_build.h b/src/amd/common/ac_llvm_build.h
index b080cca4cb7..0d261bae097 100644
--- a/src/amd/common/ac_llvm_build.h
+++ b/src/amd/common/ac_llvm_build.h
@@ -196,32 +196,36 @@ LLVMValueRef
 ac_build_fs_interp_mov(struct ac_llvm_context *ctx,
 		       LLVMValueRef parameter,
 		       LLVMValueRef llvm_chan,
 		       LLVMValueRef attr_number,
 		       LLVMValueRef params);
 
 LLVMValueRef
 ac_build_gep0(struct ac_llvm_context *ctx,
 	      LLVMValueRef base_ptr,
 	      LLVMValueRef index);
+LLVMValueRef ac_build_pointer_add(struct ac_llvm_context *ctx, LLVMValueRef ptr,
+				  LLVMValueRef index);
 
 void
 ac_build_indexed_store(struct ac_llvm_context *ctx,
 		       LLVMValueRef base_ptr, LLVMValueRef index,
 		       LLVMValueRef value);
 
 LLVMValueRef ac_build_load(struct ac_llvm_context *ctx, LLVMValueRef base_ptr,
 			   LLVMValueRef index);
 LLVMValueRef ac_build_load_invariant(struct ac_llvm_context *ctx,
 				     LLVMValueRef base_ptr, LLVMValueRef index);
 LLVMValueRef ac_build_load_to_sgpr(struct ac_llvm_context *ctx,
 				   LLVMValueRef base_ptr, LLVMValueRef index);
+LLVMValueRef ac_build_load_to_sgpr_uint_wraparound(struct ac_llvm_context *ctx,
+				   LLVMValueRef base_ptr, LLVMValueRef index);
 
 void
 ac_build_buffer_store_dword(struct ac_llvm_context *ctx,
 			    LLVMValueRef rsrc,
 			    LLVMValueRef vdata,
 			    unsigned num_channels,
 			    LLVMValueRef voffset,
 			    LLVMValueRef soffset,
 			    unsigned inst_offset,
 		            bool glc,
diff --git a/src/gallium/drivers/radeonsi/si_shader_internal.h b/src/gallium/drivers/radeonsi/si_shader_internal.h
index a638dbf8f1a..235c46ecf92 100644
--- a/src/gallium/drivers/radeonsi/si_shader_internal.h
+++ b/src/gallium/drivers/radeonsi/si_shader_internal.h
@@ -298,21 +298,22 @@ LLVMValueRef si_get_bounded_indirect_index(struct si_shader_context *ctx,
 LLVMValueRef si_get_sample_id(struct si_shader_context *ctx);
 
 void si_shader_context_init_alu(struct lp_build_tgsi_context *bld_base);
 void si_shader_context_init_mem(struct si_shader_context *ctx);
 
 LLVMValueRef si_load_sampler_desc(struct si_shader_context *ctx,
 				  LLVMValueRef list, LLVMValueRef index,
 				  enum ac_descriptor_type type);
 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 				LLVMValueRef list, LLVMValueRef index,
-				enum ac_descriptor_type desc_type, bool dcc_off);
+				enum ac_descriptor_type desc_type, bool dcc_off,
+				bool bindless);
 
 void si_load_system_value(struct si_shader_context *ctx,
 			  unsigned index,
 			  const struct tgsi_full_declaration *decl);
 void si_declare_compute_memory(struct si_shader_context *ctx);
 void si_tgsi_declare_compute_memory(struct si_shader_context *ctx,
 				    const struct tgsi_full_declaration *decl);
 
 void si_llvm_load_input_vs(
 	struct si_shader_context *ctx,
diff --git a/src/gallium/drivers/radeonsi/si_shader_nir.c b/src/gallium/drivers/radeonsi/si_shader_nir.c
index 0aefca22385..5d6280b80f7 100644
--- a/src/gallium/drivers/radeonsi/si_shader_nir.c
+++ b/src/gallium/drivers/radeonsi/si_shader_nir.c
@@ -913,31 +913,41 @@ si_nir_load_sampler_desc(struct ac_shader_abi *abi,
 		assert(base_index + constant_index < ctx->num_images);
 
 		if (dynamic_index)
 			index = si_llvm_bound_index(ctx, index, ctx->num_images);
 
 		index = LLVMBuildSub(ctx->ac.builder,
 				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 				     index, "");
 
 		/* TODO: be smarter about when we use dcc_off */
-		return si_load_image_desc(ctx, list, index, desc_type, write);
+		return si_load_image_desc(ctx, list, index, desc_type, write, bindless);
 	}
 
 	assert(base_index + constant_index < ctx->num_samplers);
 
 	if (dynamic_index)
 		index = si_llvm_bound_index(ctx, index, ctx->num_samplers);
 
 	index = LLVMBuildAdd(ctx->ac.builder, index,
 			     LLVMConstInt(ctx->i32, SI_NUM_IMAGES / 2, 0), "");
 
+	if (bindless) {
+		/* Since bindless handle arithmetic can contain an unsigned integer
+		 * wraparound and si_load_sampler_desc assumes there isn't any,
+		 * use GEP without "inbounds" (inside ac_build_pointer_add)
+		 * to prevent incorrect code generation and hangs.
+		 */
+		index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
+		list = ac_build_pointer_add(&ctx->ac, list, index);
+		index = ctx->i32_0;
+	}
 	return si_load_sampler_desc(ctx, list, index, desc_type);
 }
 
 static void bitcast_inputs(struct si_shader_context *ctx,
 			   LLVMValueRef data[4],
 			   unsigned input_idx)
 {
 	for (unsigned chan = 0; chan < 4; chan++) {
 		ctx->inputs[input_idx + chan] =
 			LLVMBuildBitCast(ctx->ac.builder, data[chan], ctx->ac.i32, "");
diff --git a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
index eaa200a95d6..cabc448a082 100644
--- a/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
+++ b/src/gallium/drivers/radeonsi/si_shader_tgsi_mem.c
@@ -169,35 +169,40 @@ static LLVMValueRef force_dcc_off(struct si_shader_context *ctx,
 		LLVMValueRef tmp;
 
 		tmp = LLVMBuildExtractElement(ctx->ac.builder, rsrc, i32_6, "");
 		tmp = LLVMBuildAnd(ctx->ac.builder, tmp, i32_C, "");
 		return LLVMBuildInsertElement(ctx->ac.builder, rsrc, tmp, i32_6, "");
 	}
 }
 
 LLVMValueRef si_load_image_desc(struct si_shader_context *ctx,
 				LLVMValueRef list, LLVMValueRef index,
-				enum ac_descriptor_type desc_type, bool dcc_off)
+				enum ac_descriptor_type desc_type, bool dcc_off,
+				bool bindless)
 {
 	LLVMBuilderRef builder = ctx->ac.builder;
 	LLVMValueRef rsrc;
 
 	if (desc_type == AC_DESC_BUFFER) {
 		index = ac_build_imad(&ctx->ac, index, LLVMConstInt(ctx->i32, 2, 0),
 				      ctx->i32_1);
 		list = LLVMBuildPointerCast(builder, list,
 					    ac_array_in_const32_addr_space(ctx->v4i32), "");
 	} else {
 		assert(desc_type == AC_DESC_IMAGE);
 	}
 
-	rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
+	if (bindless)
+		rsrc = ac_build_load_to_sgpr_uint_wraparound(&ctx->ac, list, index);
+	else
+		rsrc = ac_build_load_to_sgpr(&ctx->ac, list, index);
+
 	if (desc_type == AC_DESC_IMAGE && dcc_off)
 		rsrc = force_dcc_off(ctx, rsrc);
 	return rsrc;
 }
 
 /**
  * Load the resource descriptor for \p image.
  */
 static void
 image_fetch_rsrc(
@@ -233,39 +238,42 @@ image_fetch_rsrc(
 		 *    not lead to termination.
 		 */
 		index = si_get_bounded_indirect_index(ctx, &image->Indirect,
 						      image->Register.Index,
 						      ctx->num_images);
 		index = LLVMBuildSub(ctx->ac.builder,
 				     LLVMConstInt(ctx->i32, SI_NUM_IMAGES - 1, 0),
 				     index, "");
 	}
 
+	bool bindless = false;
+
 	if (image->Register.File != TGSI_FILE_IMAGE) {
 		/* Bindless descriptors are accessible from a different pair of
 		 * user SGPR indices.
 		 */
 		rsrc_ptr = LLVMGetParam(ctx->main_fn,
 					ctx->param_bindless_samplers_and_images);
 		index = lp_build_emit_fetch_src(bld_base, image,
 						TGSI_TYPE_UNSIGNED, 0);
 
 		/* For simplicity, bindless image descriptors use fixed
 		 * 16-dword slots for now.
 		 */
 		index = LLVMBuildMul(ctx->ac.builder, index,
 				     LLVMConstInt(ctx->i32, 2, 0), "");
+		bindless = true;
 	}
 
 	*rsrc = si_load_image_desc(ctx, rsrc_ptr, index,
 				   target == TGSI_TEXTURE_BUFFER ? AC_DESC_BUFFER : AC_DESC_IMAGE,
-				   dcc_off);
+				   dcc_off, bindless);
 }
 
 static void image_fetch_coords(
 		struct lp_build_tgsi_context *bld_base,
 		const struct tgsi_full_instruction *inst,
 		unsigned src, LLVMValueRef desc,
 		LLVMValueRef *coords)
 {
 	struct si_shader_context *ctx = si_shader_context(bld_base);
 	LLVMBuilderRef builder = ctx->ac.builder;
@@ -1061,20 +1069,29 @@ static void tex_fetch_ptrs(struct lp_build_tgsi_context *bld_base,
 	}
 
 	if (reg->Register.File != TGSI_FILE_SAMPLER) {
 		/* Bindless descriptors are accessible from a different pair of
 		 * user SGPR indices.
 		 */
 		list = LLVMGetParam(ctx->main_fn,
 				    ctx->param_bindless_samplers_and_images);
 		index = lp_build_emit_fetch_src(bld_base, reg,
 						TGSI_TYPE_UNSIGNED, 0);
+
+		/* Since bindless handle arithmetic can contain an unsigned integer
+		 * wraparound and si_load_sampler_desc assumes there isn't any,
+		 * use GEP without "inbounds" (inside ac_build_pointer_add)
+		 * to prevent incorrect code generation and hangs.
+		 */
+		index = LLVMBuildMul(ctx->ac.builder, index, LLVMConstInt(ctx->i32, 2, 0), "");
+		list = ac_build_pointer_add(&ctx->ac, list, index);
+		index = ctx->i32_0;
 	}
 
 	if (target == TGSI_TEXTURE_BUFFER)
 		*res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_BUFFER);
 	else
 		*res_ptr = si_load_sampler_desc(ctx, list, index, AC_DESC_IMAGE);
 
 	if (samp_ptr)
 		*samp_ptr = NULL;
 	if (fmask_ptr)
-- 
2.17.1



More information about the mesa-dev mailing list