[Mesa-stable] [PATCH 07/23] radeonsi: do not kill GS with memory writes

Wed Nov 30 13:45:48 UTC 2016

On 30.11.2016 14:35, Nicolai Hähnle wrote:
> From: Nicolai Hähnle <nicolai.haehnle at amd.com>
>
> Vertex emits beyond the specified maximum number of vertices are supposed to
> have no effect, which is why we used to always kill GS that reached the limit.
>
> However, if the GS also writes to memory (SSBO, atomics, shader images), then
> we must keep going and only skip the vertex emit itself.
>
> Cc: mesa-stable at lists.freedesktop.org
> ---
>  src/gallium/drivers/radeonsi/si_shader.c | 30 ++++++++++++++++++++++--------
>  1 file changed, 22 insertions(+), 8 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index 1e3be62..aac3091 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -5116,50 +5116,60 @@ static unsigned si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
>  static void si_llvm_emit_vertex(
>  	const struct lp_build_tgsi_action *action,
>  	struct lp_build_tgsi_context *bld_base,
>  	struct lp_build_emit_data *emit_data)
>  {
>  	struct si_shader_context *ctx = si_shader_context(bld_base);
>  	struct lp_build_context *uint = &bld_base->uint_bld;
>  	struct si_shader *shader = ctx->shader;
>  	struct tgsi_shader_info *info = &shader->selector->info;
>  	struct gallivm_state *gallivm = bld_base->base.gallivm;
> +	struct lp_build_if_state if_state;
>  	LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
>  					    SI_PARAM_GS2VS_OFFSET);
>  	LLVMValueRef gs_next_vertex;
>  	LLVMValueRef can_emit, kill;
>  	LLVMValueRef args[2];
>  	unsigned chan;
>  	int i;
>  	unsigned stream;
>
>  	stream = si_llvm_get_stream(bld_base, emit_data);
>
>  	/* Write vertex attribute values to GSVS ring */
>  	gs_next_vertex = LLVMBuildLoad(gallivm->builder,
>  				       ctx->gs_next_vertex[stream],
>  				       "");
>
>  	/* If this thread has already emitted the declared maximum number of
> -	 * vertices, kill it: excessive vertex emissions are not supposed to
> -	 * have any effect, and GS threads have no externally observable
> -	 * effects other than emitting vertices.
> +	 * vertices, skip the write: excessive vertex emissions are not
> +	 * supposed to have any effect.
> +	 *
> +	 * If the shader has no writes to memory, kill it instead. This skips
> +	 * further memory loads and may allow LLVM to skip to the end
> +	 * altogether.
>  	 */
>  	can_emit = LLVMBuildICmp(gallivm->builder, LLVMIntULE, gs_next_vertex,
>  				 lp_build_const_int32(gallivm,
>  						      shader->selector->gs_max_out_vertices), "");
> -	kill = lp_build_select(&bld_base->base, can_emit,
> -			       lp_build_const_float(gallivm, 1.0f),
> -			       lp_build_const_float(gallivm, -1.0f));
>
> -	lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
> -			   ctx->voidt, &kill, 1, 0);
> +	bool use_kill = !info->writes_memory;

I think there's a && max_streams == 0 missing here.

I'm pretty sure the intention is that the max_vertices is per-stream, 
but reading through the GLSL docs again, it kind of sounds like at least 
the spec language mandates it to be per-invocation (i.e. sum across 
streams).

In any case, what we do here is inconsistent with either interpretation, 
but I'm also not aware of any test case (yet?) which would vote one way 
or the other.

Any ideas?

Thanks,
Nicolai

> +	if (use_kill) {
> +		kill = lp_build_select(&bld_base->base, can_emit,
> +				       lp_build_const_float(gallivm, 1.0f),
> +				       lp_build_const_float(gallivm, -1.0f));
> +
> +		lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
> +				   ctx->voidt, &kill, 1, 0);
> +	} else {
> +		lp_build_if(&if_state, gallivm, can_emit);
> +	}
>
>  	for (i = 0; i < info->num_outputs; i++) {
>  		LLVMValueRef *out_ptr =
>  			ctx->soa.outputs[i];
>
>  		for (chan = 0; chan < 4; chan++) {
>  			LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
>  			LLVMValueRef voffset =
>  				lp_build_const_int32(gallivm, (i * 4 + chan) *
>  						     shader->selector->gs_max_out_vertices);
> @@ -5171,30 +5181,34 @@ static void si_llvm_emit_vertex(
>
>  			build_tbuffer_store(ctx,
>  					    ctx->gsvs_ring[stream],
>  					    out_val, 1,
>  					    voffset, soffset, 0,
>  					    V_008F0C_BUF_DATA_FORMAT_32,
>  					    V_008F0C_BUF_NUM_FORMAT_UINT,
>  					    1, 0, 1, 1, 0);
>  		}
>  	}
> +
>  	gs_next_vertex = lp_build_add(uint, gs_next_vertex,
>  				      lp_build_const_int32(gallivm, 1));
>
>  	LLVMBuildStore(gallivm->builder, gs_next_vertex, ctx->gs_next_vertex[stream]);
>
>  	/* Signal vertex emission */
>  	args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS | (stream << 8));
>  	args[1] = LLVMGetParam(ctx->main_fn, SI_PARAM_GS_WAVE_ID);
>  	lp_build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
>  			   ctx->voidt, args, 2, 0);
> +
> +	if (!use_kill)
> +		lp_build_endif(&if_state);
>  }
>
>  /* Cut one primitive from the geometry shader */
>  static void si_llvm_emit_primitive(
>  	const struct lp_build_tgsi_action *action,
>  	struct lp_build_tgsi_context *bld_base,
>  	struct lp_build_emit_data *emit_data)
>  {
>  	struct si_shader_context *ctx = si_shader_context(bld_base);
>  	struct gallivm_state *gallivm = bld_base->base.gallivm;
>