[Mesa-dev] [PATCH 3/4] radeonsi: add GS multiple streams support (v1.1)
Marek Olšák
maraeo at gmail.com
Wed Jul 29 14:31:28 PDT 2015
On Wed, Jul 29, 2015 at 1:20 AM, Dave Airlie <airlied at gmail.com> wrote:
> From: Dave Airlie <airlied at redhat.com>
>
> This is the final piece for ARB_gpu_shader5,
>
> The code is based on the r600 code from Glenn Kennard,
> and myself.
>
> While developing this, I'm not 100% sure of all the calculations
> made in the GS registers, this is why the max_stream is worked
> out there and used to limit the changes in registers. Otherwise
> my initial attempts either regressed GS texelFetch tests
> or primitive-id-restart. The current code has no regressions
> in piglit.
>
> This commit doesn't enable ARB_gpu_shader5, since that just
> bumps the glsl level to 4.00, so I'll just do a separate patch
> for 4.10.
>
> v1.1: fix bug introduced in rebase.
>
> Signed-off-by: Dave Airlie <airlied at redhat.com>
> ---
> src/gallium/drivers/radeonsi/si_descriptors.c | 4 +-
> src/gallium/drivers/radeonsi/si_pipe.c | 2 +-
> src/gallium/drivers/radeonsi/si_shader.c | 59 ++++++++++++++++---
> src/gallium/drivers/radeonsi/si_state.c | 4 --
> src/gallium/drivers/radeonsi/si_state.h | 8 ++-
> src/gallium/drivers/radeonsi/si_state_shaders.c | 75 +++++++++++++++++++------
> 6 files changed, 120 insertions(+), 32 deletions(-)
>
> diff --git a/src/gallium/drivers/radeonsi/si_descriptors.c b/src/gallium/drivers/radeonsi/si_descriptors.c
> index 2e2a35b..14bb6e1 100644
> --- a/src/gallium/drivers/radeonsi/si_descriptors.c
> +++ b/src/gallium/drivers/radeonsi/si_descriptors.c
> @@ -724,7 +724,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
> struct pipe_resource *buffer,
> unsigned stride, unsigned num_records,
> bool add_tid, bool swizzle,
> - unsigned element_size, unsigned index_stride)
> + unsigned element_size, unsigned index_stride, uint64_t offset)
> {
> struct si_context *sctx = (struct si_context *)ctx;
> struct si_buffer_resources *buffers = &sctx->rw_buffers[shader];
> @@ -741,7 +741,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
> if (buffer) {
> uint64_t va;
>
> - va = r600_resource(buffer)->gpu_address;
> + va = r600_resource(buffer)->gpu_address + offset;
>
> switch (element_size) {
> default:
> diff --git a/src/gallium/drivers/radeonsi/si_pipe.c b/src/gallium/drivers/radeonsi/si_pipe.c
> index 808b9bc..a120282 100644
> --- a/src/gallium/drivers/radeonsi/si_pipe.c
> +++ b/src/gallium/drivers/radeonsi/si_pipe.c
> @@ -316,7 +316,7 @@ static int si_get_param(struct pipe_screen* pscreen, enum pipe_cap param)
> case PIPE_CAP_MAX_GEOMETRY_TOTAL_OUTPUT_COMPONENTS:
> return 4095;
> case PIPE_CAP_MAX_VERTEX_STREAMS:
> - return 1;
> + return 4;
>
> case PIPE_CAP_MAX_VERTEX_ATTRIB_STRIDE:
> return 2048;
> diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
> index fa31f73..b472fa6 100644
> --- a/src/gallium/drivers/radeonsi/si_shader.c
> +++ b/src/gallium/drivers/radeonsi/si_shader.c
> @@ -31,6 +31,7 @@
> #include "gallivm/lp_bld_intr.h"
> #include "gallivm/lp_bld_logic.h"
> #include "gallivm/lp_bld_arit.h"
> +#include "gallivm/lp_bld_bitarit.h"
> #include "gallivm/lp_bld_flow.h"
> #include "radeon/r600_cs.h"
> #include "radeon/radeon_llvm.h"
> @@ -1576,6 +1577,8 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
> LLVMValueRef can_emit =
> LLVMBuildICmp(builder, LLVMIntULT, tid, so_vtx_count, "");
>
> + LLVMValueRef stream_id =
> + unpack_param(shader, shader->param_streamout_config, 24, 2);
Wrong indentation and missing an empty line before the following comment.
> /* Emit the streamout code conditionally. This actually avoids
> * out-of-bounds buffer access. The hw tells us via the SGPR
> * (so_vtx_count) which threads are allowed to emit streamout data. */
> @@ -1615,8 +1618,9 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
> unsigned reg = so->output[i].register_index;
> unsigned start = so->output[i].start_component;
> unsigned num_comps = so->output[i].num_components;
> + unsigned stream = so->output[i].stream;
> LLVMValueRef out[4];
> -
> + struct lp_build_if_state if_ctx_stream;
There should be an empty line after the declaration.
> assert(num_comps && num_comps <= 4);
> if (!num_comps || num_comps > 4)
> continue;
> @@ -1649,11 +1653,15 @@ static void si_llvm_emit_streamout(struct si_shader_context *shader,
> break;
> }
>
> + LLVMValueRef can_emit_stream =
> + LLVMBuildICmp(builder, LLVMIntEQ, stream_id, lp_build_const_int32(gallivm, stream), "");
Wrong indentation.
> + lp_build_if(&if_ctx_stream, gallivm, can_emit_stream);
> build_tbuffer_store_dwords(shader, shader->so_buffers[buf_idx],
> vdata, num_comps,
> so_write_offset[buf_idx],
> LLVMConstInt(i32, 0, 0),
> so->output[i].dst_offset*4);
> + lp_build_endif(&if_ctx_stream);
> }
> }
> lp_build_endif(&if_ctx);
> @@ -3188,6 +3196,22 @@ static void build_interp_intrinsic(const struct lp_build_tgsi_action *action,
> }
> }
>
> +static LLVMValueRef si_llvm_get_stream(struct lp_build_tgsi_context *bld_base,
> + struct lp_build_emit_data *emit_data)
> +{
> + struct lp_build_context *uint = &bld_base->uint_bld;
> + struct gallivm_state *gallivm = bld_base->base.gallivm;
> + LLVMValueRef (*imms)[4] = lp_soa_context(bld_base)->immediates;
> + LLVMValueRef stream;
> + struct tgsi_src_register src0 = emit_data->inst->Src[0].Register;
> +
> + assert(src0.File == TGSI_FILE_IMMEDIATE);
> +
> + stream = imms[src0.Index][src0.SwizzleX];
> + stream = lp_build_and(uint, stream, lp_build_const_int32(gallivm, 3));
> + return stream;
You can use LLVMConstIntGetZExtValue to evaluate the constant
LLVMValueRef and return an unsigned value. You can then use that result
as the index into sctx->gsvs_ring, which can be an array of 4 descriptors.
With that, you don't have to use build_indexed_load_const in
si_llvm_emit_vertex.
gs_next_vertex can also be an array of 4 LLVMValueRef variables, which
will eliminate the need to allocate the array in the IR and use
LLVMBuildInsert/ExtractElement functions.
> +}
> +
> /* Emit one vertex from the geometry shader */
> static void si_llvm_emit_vertex(
> const struct lp_build_tgsi_action *action,
> @@ -3202,14 +3226,21 @@ static void si_llvm_emit_vertex(
> LLVMTypeRef i32 = LLVMInt32TypeInContext(gallivm->context);
> LLVMValueRef soffset = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
> SI_PARAM_GS2VS_OFFSET);
> + LLVMValueRef buf_ptr = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn,
> + SI_PARAM_RW_BUFFERS);
> + LLVMValueRef gs_next_vertex_array;
> LLVMValueRef gs_next_vertex;
> LLVMValueRef can_emit, kill;
> LLVMValueRef args[2];
> unsigned chan;
> int i;
> + LLVMValueRef stream;
> + LLVMValueRef gsvs_ring;
>
> + stream = si_llvm_get_stream(bld_base, emit_data);
> /* Write vertex attribute values to GSVS ring */
> - gs_next_vertex = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
> + gs_next_vertex_array = LLVMBuildLoad(gallivm->builder, si_shader_ctx->gs_next_vertex, "");
> + gs_next_vertex = LLVMBuildExtractElement(gallivm->builder, gs_next_vertex_array, stream, "");
>
> /* If this thread has already emitted the declared maximum number of
> * vertices, kill it: excessive vertex emissions are not supposed to
> @@ -3225,6 +3256,9 @@ static void si_llvm_emit_vertex(
> build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
> LLVMVoidTypeInContext(gallivm->context), &kill, 1, 0);
>
> + gsvs_ring = build_indexed_load_const(si_shader_ctx, buf_ptr,
> + lp_build_add(uint, stream, lp_build_const_int32(gallivm, 1)));
> +
> for (i = 0; i < info->num_outputs; i++) {
> LLVMValueRef *out_ptr =
> si_shader_ctx->radeon_bld.soa.outputs[i];
> @@ -3241,7 +3275,7 @@ static void si_llvm_emit_vertex(
> out_val = LLVMBuildBitCast(gallivm->builder, out_val, i32, "");
>
> build_tbuffer_store(si_shader_ctx,
> - si_shader_ctx->gsvs_ring,
> + gsvs_ring,
> out_val, 1,
> voffset, soffset, 0,
> V_008F0C_BUF_DATA_FORMAT_32,
> @@ -3251,10 +3285,16 @@ static void si_llvm_emit_vertex(
> }
> gs_next_vertex = lp_build_add(uint, gs_next_vertex,
> lp_build_const_int32(gallivm, 1));
> - LLVMBuildStore(gallivm->builder, gs_next_vertex, si_shader_ctx->gs_next_vertex);
> + gs_next_vertex_array = LLVMBuildInsertElement(gallivm->builder, gs_next_vertex_array, gs_next_vertex,
> + stream, "");
> + LLVMBuildStore(gallivm->builder, gs_next_vertex_array, si_shader_ctx->gs_next_vertex);
> +
> + /* shift stream value for or'ing */
> + stream = lp_build_shl_imm(uint, stream, 8);
>
> /* Signal vertex emission */
> args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_EMIT | SENDMSG_GS);
> + args[0] = lp_build_or(uint, args[0], stream);
> args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
> build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
> LLVMVoidTypeInContext(gallivm->context), args, 2,
> @@ -3269,10 +3309,15 @@ static void si_llvm_emit_primitive(
> {
> struct si_shader_context *si_shader_ctx = si_shader_context(bld_base);
> struct gallivm_state *gallivm = bld_base->base.gallivm;
> + struct lp_build_context *uint = &bld_base->uint_bld;
> LLVMValueRef args[2];
> + LLVMValueRef stream;
>
> /* Signal primitive cut */
> + stream = si_llvm_get_stream(bld_base, emit_data);
> + stream = lp_build_shl_imm(uint, stream, 8);
> args[0] = lp_build_const_int32(gallivm, SENDMSG_GS_OP_CUT | SENDMSG_GS);
> + args[0] = lp_build_or(uint, args[0], stream);
> args[1] = LLVMGetParam(si_shader_ctx->radeon_bld.main_fn, SI_PARAM_GS_WAVE_ID);
> build_intrinsic(gallivm->builder, "llvm.SI.sendmsg",
> LLVMVoidTypeInContext(gallivm->context), args, 2,
> @@ -3651,8 +3696,7 @@ static void preload_ring_buffers(struct si_shader_context *si_shader_ctx)
> build_indexed_load_const(si_shader_ctx, buf_ptr, offset);
> }
>
> - if (si_shader_ctx->type == TGSI_PROCESSOR_GEOMETRY ||
> - si_shader_ctx->shader->is_gs_copy_shader) {
> + if (si_shader_ctx->shader->is_gs_copy_shader) {
> LLVMValueRef offset = lp_build_const_int32(gallivm, SI_RING_GSVS);
>
> si_shader_ctx->gsvs_ring =
> @@ -4076,9 +4120,10 @@ int si_shader_create(struct si_screen *sscreen, LLVMTargetMachineRef tm,
> preload_ring_buffers(&si_shader_ctx);
>
> if (si_shader_ctx.type == TGSI_PROCESSOR_GEOMETRY) {
> + /* create a 4xuint32 */
> si_shader_ctx.gs_next_vertex =
> lp_build_alloca(bld_base->base.gallivm,
> - bld_base->uint_bld.elem_type, "");
> + lp_build_int_vec_type(bld_base->base.gallivm, lp_type_uint_vec(32, 32*4)), "");
> }
>
> if (!lp_build_tgsi_llvm(bld_base, tokens)) {
> diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
> index ab5c3ca..86e1624 100644
> --- a/src/gallium/drivers/radeonsi/si_state.c
> +++ b/src/gallium/drivers/radeonsi/si_state.c
> @@ -3138,10 +3138,6 @@ static void si_init_config(struct si_context *sctx)
> si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0);
> si_pm4_set_reg(pm4, R_028B28_VGT_STRMOUT_DRAW_OPAQUE_OFFSET, 0);
>
> - si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, 0);
> - si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, 0);
> - si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, 0);
> -
> si_pm4_set_reg(pm4, R_028B98_VGT_STRMOUT_BUFFER_CONFIG, 0x0);
> si_pm4_set_reg(pm4, R_028AB4_VGT_REUSE_OFF, 0);
> si_pm4_set_reg(pm4, R_028AB8_VGT_VTX_CNT_EN, 0x0);
> diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
> index 2522053..132ddda 100644
> --- a/src/gallium/drivers/radeonsi/si_state.h
> +++ b/src/gallium/drivers/radeonsi/si_state.h
> @@ -147,8 +147,12 @@ struct si_shader_data {
> */
> #define SI_RING_TESS_FACTOR 0 /* for HS (TCS) */
> #define SI_RING_ESGS 0 /* for ES, GS */
> +#define SI_RING_ESGS 0
This definition is redundant.
> #define SI_RING_GSVS 1 /* for GS, VS */
> -#define SI_NUM_RING_BUFFERS 2
> +#define SI_RING_GSVS_1 2 /* 1, 2, 3 for GS */
> +#define SI_RING_GSVS_2 3
> +#define SI_RING_GSVS_3 4
> +#define SI_NUM_RING_BUFFERS 5
> #define SI_SO_BUF_OFFSET SI_NUM_RING_BUFFERS
> #define SI_NUM_RW_BUFFERS (SI_SO_BUF_OFFSET + 4)
>
> @@ -249,7 +253,7 @@ void si_set_ring_buffer(struct pipe_context *ctx, uint shader, uint slot,
> struct pipe_resource *buffer,
> unsigned stride, unsigned num_records,
> bool add_tid, bool swizzle,
> - unsigned element_size, unsigned index_stride);
> + unsigned element_size, unsigned index_stride, uint64_t offset);
> void si_init_all_descriptors(struct si_context *sctx);
> void si_release_all_descriptors(struct si_context *sctx);
> void si_all_descriptors_begin_new_cs(struct si_context *sctx);
> diff --git a/src/gallium/drivers/radeonsi/si_state_shaders.c b/src/gallium/drivers/radeonsi/si_state_shaders.c
> index 18bddfd..daf41f3 100644
> --- a/src/gallium/drivers/radeonsi/si_state_shaders.c
> +++ b/src/gallium/drivers/radeonsi/si_state_shaders.c
> @@ -206,16 +206,32 @@ static void si_shader_es(struct si_shader *shader)
> si_set_tesseval_regs(shader, pm4);
> }
>
> +static unsigned si_gs_get_max_stream(struct si_shader *shader)
> +{
> + struct pipe_stream_output_info *so = &shader->selector->so;
> + unsigned max_stream, i;
There should be an empty line after the declarations.
> + if (so->num_outputs == 0)
> + return 0;
> +
> + max_stream = 0;
> + for (i = 0; i < so->num_outputs; i++) {
> + if (so->output[i].stream > max_stream)
> + max_stream = so->output[i].stream;
> + }
> + return max_stream;
> +}
> +
> static void si_shader_gs(struct si_shader *shader)
> {
> - unsigned gs_vert_itemsize = shader->selector->info.num_outputs * (16 >> 2);
> + unsigned gs_vert_itemsize = shader->selector->info.num_outputs * 16;
> unsigned gs_max_vert_out = shader->selector->gs_max_out_vertices;
> - unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
> + unsigned gsvs_itemsize = (gs_vert_itemsize * gs_max_vert_out) >> 2;
> unsigned gs_num_invocations = shader->selector->gs_num_invocations;
> unsigned cut_mode;
> struct si_pm4_state *pm4;
> unsigned num_sgprs, num_user_sgprs;
> uint64_t va;
> + unsigned max_stream = si_gs_get_max_stream(shader);
>
> /* The GSVS_RING_ITEMSIZE register takes 15 bits */
> assert(gsvs_itemsize < (1 << 15));
> @@ -243,16 +259,19 @@ static void si_shader_gs(struct si_shader *shader)
> S_028A40_GS_WRITE_OPTIMIZE(1));
>
> si_pm4_set_reg(pm4, R_028A60_VGT_GSVS_RING_OFFSET_1, gsvs_itemsize);
> - si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize);
> - si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize);
> + si_pm4_set_reg(pm4, R_028A64_VGT_GSVS_RING_OFFSET_2, gsvs_itemsize * ((max_stream >= 2) ? 2 : 1));
> + si_pm4_set_reg(pm4, R_028A68_VGT_GSVS_RING_OFFSET_3, gsvs_itemsize * ((max_stream >= 3) ? 3 : 1));
>
> si_pm4_set_reg(pm4, R_028AAC_VGT_ESGS_RING_ITEMSIZE,
> util_bitcount64(shader->selector->inputs_read) * (16 >> 2));
> - si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize);
> + si_pm4_set_reg(pm4, R_028AB0_VGT_GSVS_RING_ITEMSIZE, gsvs_itemsize * (max_stream + 1));
>
> si_pm4_set_reg(pm4, R_028B38_VGT_GS_MAX_VERT_OUT, gs_max_vert_out);
>
> - si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize);
> + si_pm4_set_reg(pm4, R_028B5C_VGT_GS_VERT_ITEMSIZE, gs_vert_itemsize >> 2);
> + si_pm4_set_reg(pm4, R_028B60_VGT_GS_VERT_ITEMSIZE_1, (max_stream >= 1) ? gs_vert_itemsize >> 2 : 0);
> + si_pm4_set_reg(pm4, R_028B64_VGT_GS_VERT_ITEMSIZE_2, (max_stream >= 2) ? gs_vert_itemsize >> 2 : 0);
> + si_pm4_set_reg(pm4, R_028B68_VGT_GS_VERT_ITEMSIZE_3, (max_stream >= 3) ? gs_vert_itemsize >> 2 : 0);
>
> si_pm4_set_reg(pm4, R_028B90_VGT_GS_INSTANCE_CNT,
> S_028B90_CNT(MIN2(gs_num_invocations, 127)) |
> @@ -1001,15 +1020,42 @@ static void si_init_gs_rings(struct si_context *sctx)
>
> si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_ESGS,
> sctx->esgs_ring, 0, esgs_ring_size,
> - true, true, 4, 64);
> + true, true, 4, 64, 0);
> si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_ESGS,
> sctx->esgs_ring, 0, esgs_ring_size,
> - false, false, 0, 0);
> + false, false, 0, 0, 0);
> si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_VERTEX, SI_RING_GSVS,
> sctx->gsvs_ring, 0, gsvs_ring_size,
> - false, false, 0, 0);
> + false, false, 0, 0, 0);
> }
>
> +static void si_update_gs_rings(struct si_context *sctx)
> +{
> + unsigned gs_vert_itemsize = sctx->gs_shader->info.num_outputs * 16;
> + unsigned gs_max_vert_out = sctx->gs_shader->gs_max_out_vertices;
> + unsigned gsvs_itemsize = gs_vert_itemsize * gs_max_vert_out;
> + uint64_t offset;
> +
> + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS,
> + sctx->gsvs_ring, gsvs_itemsize,
> + 64, true, true, 4, 16, 0);
> +
> + offset = gsvs_itemsize * 64;
> + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_1,
> + sctx->gsvs_ring, gsvs_itemsize,
> + 64, true, true, 4, 16, offset);
> +
> + offset = (gsvs_itemsize * 2) * 64;
> + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_2,
> + sctx->gsvs_ring, gsvs_itemsize,
> + 64, true, true, 4, 16, offset);
> +
> + offset = (gsvs_itemsize * 3) * 64;
> + si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_GEOMETRY, SI_RING_GSVS_3,
> + sctx->gsvs_ring, gsvs_itemsize,
> + 64, true, true, 4, 16, offset);
> +
> +}
> /**
> * @returns 1 if \p sel has been updated to use a new scratch buffer and 0
> * otherwise.
> @@ -1171,7 +1217,7 @@ static void si_init_tess_factor_ring(struct si_context *sctx)
>
> si_set_ring_buffer(&sctx->b.b, PIPE_SHADER_TESS_CTRL,
> SI_RING_TESS_FACTOR, sctx->tf_ring, 0,
> - sctx->tf_ring->width0, false, false, 0, 0);
> + sctx->tf_ring->width0, false, false, 0, 0, 0);
>
> sctx->b.flags |= SI_CONTEXT_VGT_FLUSH;
> }
> @@ -1252,7 +1298,7 @@ static void si_update_so(struct si_context *sctx, struct si_shader_selector *sha
> int i;
>
> for (i = 0; i < so->num_outputs; i++)
> - enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer);
> + enabled_stream_buffers_mask |= (1 << so->output[i].output_buffer) << so->output[i].stream * 4;
Missing parentheses? (so->output[i].stream * 4)
Marek
More information about the mesa-dev
mailing list