[Mesa-dev] [PATCH v2 22/25] radeonsi: pack GS output components for each vertex stream contiguously
Nicolai Hähnle
nhaehnle at gmail.com
Tue Dec 6 10:48:33 UTC 2016
From: Nicolai Hähnle <nicolai.haehnle at amd.com>
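Instead of placing each output component at the fixed slot (i * 4 + chan),
the components that belong to a vertex stream are now numbered consecutively
within that stream, in both the GS and the GS copy shader.
Note that the memory layout of one vertex stream inside one "item" (= memory
written by one GS wave) on the GSVS ring is: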
t0v0c0 ... t15v0c0 t0v1c0 ... t15v1c0 ... t0vLc0 ... t15vLc0
t0v0c1 ... t15v0c1 t0v1c1 ... t15v1c1 ... t0vLc1 ... t15vLc1
...
t0v0cL ... t15v0cL t0v1cL ... t15v1cL ... t0vLcL ... t15vLcL
t16v0c0 ... t31v0c0 t16v1c0 ... t31v1c0 ... t16vLc0 ... t31vLc0
t16v0c1 ... t31v0c1 t16v1c1 ... t31v1c1 ... t16vLc1 ... t31vLc1
...
t16v0cL ... t31v0cL t16v1cL ... t31v1cL ... t16vLcL ... t31vLcL
...
t48v0c0 ... t63v0c0 t48v1c0 ... t63v1c0 ... t48vLc0 ... t63vLc0
t48v0c1 ... t63v0c1 t48v1c1 ... t63v1c1 ... t48vLc1 ... t63vLc1
...
t48v0cL ... t63v0cL t48v1cL ... t63v1cL ... t48vLcL ... t63vLcL
where tNN indicates the thread number, vNN the vertex number (in the order of
EMIT_VERTEX), and cNN the output component (vL and cL are the last vertex and
component, respectively).
The vertex streams are laid out sequentially.
The swizzling by 16 threads is hard-coded in the way the VGT generates the
offset passed into the GS copy shader. The jump every 16 threads is
calculated from VGT_GSVS_RING_OFFSET_n and VGT_GSVS_RING_ITEMSIZE in a way
that makes it difficult to deviate from this layout. (At least, that is what
I confirmed experimentally on VI after first trying the simpler route of
just interleaving the vertex streams.)
---
src/gallium/drivers/radeonsi/si_shader.c | 11 ++++++++---
1 file changed, 8 insertions(+), 3 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index cd109e1..e3ed0d3 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -5150,21 +5150,21 @@ static void si_llvm_emit_vertex(
struct lp_build_context *uint = &bld_base->uint_bld;
struct si_shader *shader = ctx->shader;
struct tgsi_shader_info *info = &shader->selector->info;
struct gallivm_state *gallivm = bld_base->base.gallivm;
struct lp_build_if_state if_state;
LLVMValueRef soffset = LLVMGetParam(ctx->main_fn,
SI_PARAM_GS2VS_OFFSET);
LLVMValueRef gs_next_vertex;
LLVMValueRef can_emit, kill;
LLVMValueRef args[2];
- unsigned chan;
+ unsigned chan, offset;
int i;
unsigned stream;
stream = si_llvm_get_stream(bld_base, emit_data);
/* Write vertex attribute values to GSVS ring */
gs_next_vertex = LLVMBuildLoad(gallivm->builder,
ctx->gs_next_vertex[stream],
"");
@@ -5185,33 +5185,35 @@ static void si_llvm_emit_vertex(
kill = lp_build_select(&bld_base->base, can_emit,
lp_build_const_float(gallivm, 1.0f),
lp_build_const_float(gallivm, -1.0f));
lp_build_intrinsic(gallivm->builder, "llvm.AMDGPU.kill",
ctx->voidt, &kill, 1, 0);
} else {
lp_build_if(&if_state, gallivm, can_emit);
}
+ offset = 0;
for (i = 0; i < info->num_outputs; i++) {
LLVMValueRef *out_ptr =
ctx->soa.outputs[i];
for (chan = 0; chan < 4; chan++) {
if (!(info->output_usagemask[i] & (1 << chan)) ||
((info->output_streams[i] >> (2 * chan)) & 3) != stream)
continue;
LLVMValueRef out_val = LLVMBuildLoad(gallivm->builder, out_ptr[chan], "");
LLVMValueRef voffset =
- lp_build_const_int32(gallivm, (i * 4 + chan) *
+ lp_build_const_int32(gallivm, offset *
shader->selector->gs_max_out_vertices);
+ offset++;
voffset = lp_build_add(uint, voffset, gs_next_vertex);
voffset = lp_build_mul_imm(uint, voffset, 4);
out_val = LLVMBuildBitCast(gallivm->builder, out_val, ctx->i32, "");
build_tbuffer_store(ctx,
ctx->gsvs_ring[stream],
out_val, 1,
voffset, soffset, 0,
@@ -6285,43 +6287,46 @@ si_generate_gs_copy_shader(struct si_screen *sscreen,
}
LLVMBasicBlockRef end_bb;
LLVMValueRef switch_inst;
end_bb = LLVMAppendBasicBlockInContext(gallivm->context, ctx.main_fn, "end");
switch_inst = LLVMBuildSwitch(builder, stream_id, end_bb, 4);
for (int stream = 0; stream < 4; stream++) {
LLVMBasicBlockRef bb;
+ unsigned offset;
if (!gsinfo->num_stream_output_components[stream])
continue;
if (stream > 0 && !gs_selector->so.num_outputs)
continue;
bb = LLVMInsertBasicBlockInContext(gallivm->context, end_bb, "out");
LLVMAddCase(switch_inst, lp_build_const_int32(gallivm, stream), bb);
LLVMPositionBuilderAtEnd(builder, bb);
/* Fetch vertex data from GSVS ring */
+ offset = 0;
for (i = 0; i < gsinfo->num_outputs; ++i) {
for (unsigned chan = 0; chan < 4; chan++) {
if (!(gsinfo->output_usagemask[i] & (1 << chan)) ||
outputs[i].vertex_stream[chan] != stream) {
outputs[i].values[chan] = ctx.soa.bld_base.base.undef;
continue;
}
args[2] = lp_build_const_int32(
gallivm,
- (i * 4 + chan) * gs_selector->gs_max_out_vertices * 16 * 4);
+ offset * gs_selector->gs_max_out_vertices * 16 * 4);
+ offset++;
outputs[i].values[chan] =
LLVMBuildBitCast(gallivm->builder,
lp_build_intrinsic(gallivm->builder,
"llvm.SI.buffer.load.dword.i32.i32",
ctx.i32, args, 9,
LP_FUNC_ATTR_READONLY),
ctx.f32, "");
}
}
--
2.7.4
More information about the mesa-dev mailing list