[Mesa-dev] [PATCH 3/5] radeonsi: use faster integer division for instance divisors
Marek Olšák
maraeo at gmail.com
Sun Sep 23 16:57:53 UTC 2018
From: Marek Olšák <marek.olsak at amd.com>
We know the divisors when we upload them, so instead we can precompute
and upload division factors derived from each divisor.
This fast division consists of add, mul_hi, and two shifts,
and we have to load 4 dwords instead of 1.
This probably won't affect any apps.
---
src/gallium/drivers/radeonsi/si_shader.c | 46 +++++++++++++++-----------------
src/gallium/drivers/radeonsi/si_state.c | 42 ++++++++++++++++++++++++-----
src/gallium/drivers/radeonsi/si_state.h | 2 +-
3 files changed, 57 insertions(+), 33 deletions(-)
diff --git a/src/gallium/drivers/radeonsi/si_shader.c b/src/gallium/drivers/radeonsi/si_shader.c
index 36f58e2..90cb059 100644
--- a/src/gallium/drivers/radeonsi/si_shader.c
+++ b/src/gallium/drivers/radeonsi/si_shader.c
@@ -421,34 +421,20 @@ static LLVMValueRef get_tcs_in_vertex_dw_stride(struct si_shader_context *ctx)
return LLVMConstInt(ctx->i32, stride, 0);
}
return si_unpack_param(ctx, ctx->param_vs_state_bits, 24, 8);
default:
assert(0);
return NULL;
}
}
-static LLVMValueRef get_instance_index_for_fetch(
- struct si_shader_context *ctx,
- unsigned param_start_instance, LLVMValueRef divisor)
-{
- LLVMValueRef result = ctx->abi.instance_id;
-
- /* The division must be done before START_INSTANCE is added. */
- if (divisor != ctx->i32_1)
- result = LLVMBuildUDiv(ctx->ac.builder, result, divisor, "");
-
- return LLVMBuildAdd(ctx->ac.builder, result,
- LLVMGetParam(ctx->main_fn, param_start_instance), "");
-}
-
/* Bitcast <4 x float> to <2 x double>, extract the component, and convert
* to float. */
static LLVMValueRef extract_double_to_float(struct si_shader_context *ctx,
LLVMValueRef vec4,
unsigned double_index)
{
LLVMBuilderRef builder = ctx->ac.builder;
LLVMTypeRef f64 = LLVMDoubleTypeInContext(ctx->ac.context);
LLVMValueRef dvec2 = LLVMBuildBitCast(builder, vec4,
LLVMVectorType(f64, 2), "");
@@ -7294,34 +7280,44 @@ static void si_build_vs_prolog_function(struct si_shader_context *ctx,
ac_build_load_to_sgpr(&ctx->ac, list, buf_index);
}
for (i = 0; i <= key->vs_prolog.last_input; i++) {
bool divisor_is_one =
key->vs_prolog.states.instance_divisor_is_one & (1u << i);
bool divisor_is_fetched =
key->vs_prolog.states.instance_divisor_is_fetched & (1u << i);
LLVMValueRef index;
- if (divisor_is_one || divisor_is_fetched) {
- LLVMValueRef divisor = ctx->i32_1;
+ if (divisor_is_one) {
+ index = ctx->abi.instance_id;
+ } else if (divisor_is_fetched) {
+ LLVMValueRef udiv_factors[4];
- if (divisor_is_fetched) {
- divisor = buffer_load_const(ctx, instance_divisor_constbuf,
- LLVMConstInt(ctx->i32, i * 4, 0));
- divisor = ac_to_integer(&ctx->ac, divisor);
+ for (unsigned j = 0; j < 4; j++) {
+ udiv_factors[j] =
+ buffer_load_const(ctx, instance_divisor_constbuf,
+ LLVMConstInt(ctx->i32, i*16 + j*4, 0));
+ udiv_factors[j] = ac_to_integer(&ctx->ac, udiv_factors[j]);
}
+ /* The faster NUW version doesn't work when InstanceID == UINT_MAX.
+ * Such InstanceID might not be achievable in a reasonable time though.
+ */
+ index = ac_build_fast_udiv_nuw(&ctx->ac, ctx->abi.instance_id,
+ udiv_factors[0], udiv_factors[1],
+ udiv_factors[2], udiv_factors[3]);
+ }
- /* InstanceID / Divisor + StartInstance */
- index = get_instance_index_for_fetch(ctx,
- user_sgpr_base +
- SI_SGPR_START_INSTANCE,
- divisor);
+ if (divisor_is_one || divisor_is_fetched) {
+ /* Add StartInstance. */
+ index = LLVMBuildAdd(ctx->ac.builder, index,
+ LLVMGetParam(ctx->main_fn, user_sgpr_base +
+ SI_SGPR_START_INSTANCE), "");
} else {
/* VertexID + BaseVertex */
index = LLVMBuildAdd(ctx->ac.builder,
ctx->abi.vertex_id,
LLVMGetParam(func, user_sgpr_base +
SI_SGPR_BASE_VERTEX), "");
}
index = ac_to_float(&ctx->ac, index);
ret = LLVMBuildInsertValue(ctx->ac.builder, ret, index,
diff --git a/src/gallium/drivers/radeonsi/si_state.c b/src/gallium/drivers/radeonsi/si_state.c
index bc1417a..aa57b3f 100644
--- a/src/gallium/drivers/radeonsi/si_state.c
+++ b/src/gallium/drivers/radeonsi/si_state.c
@@ -25,20 +25,21 @@
#include "si_build_pm4.h"
#include "gfx9d.h"
#include "si_query.h"
#include "util/u_dual_blend.h"
#include "util/u_format.h"
#include "util/u_format_s3tc.h"
#include "util/u_memory.h"
#include "util/u_resource.h"
#include "util/u_upload_mgr.h"
+#include "util/fast_idiv_by_const.h"
static unsigned si_map_swizzle(unsigned swizzle)
{
switch (swizzle) {
case PIPE_SWIZZLE_Y:
return V_008F0C_SQ_SEL_Y;
case PIPE_SWIZZLE_Z:
return V_008F0C_SQ_SEL_Z;
case PIPE_SWIZZLE_W:
return V_008F0C_SQ_SEL_W;
@@ -4348,20 +4349,26 @@ static void si_delete_sampler_state(struct pipe_context *ctx, void *state)
* Vertex elements & buffers
*/
static void *si_create_vertex_elements(struct pipe_context *ctx,
unsigned count,
const struct pipe_vertex_element *elements)
{
struct si_screen *sscreen = (struct si_screen*)ctx->screen;
struct si_vertex_elements *v = CALLOC_STRUCT(si_vertex_elements);
bool used[SI_NUM_VERTEX_BUFFERS] = {};
+ struct util_fast_udiv_info divisor_factors[SI_MAX_ATTRIBS] = {};
+ STATIC_ASSERT(sizeof(struct util_fast_udiv_info) == 16);
+ STATIC_ASSERT(sizeof(divisor_factors[0].multiplier) == 4);
+ STATIC_ASSERT(sizeof(divisor_factors[0].pre_shift) == 4);
+ STATIC_ASSERT(sizeof(divisor_factors[0].post_shift) == 4);
+ STATIC_ASSERT(sizeof(divisor_factors[0].increment) == 4);
int i;
assert(count <= SI_MAX_ATTRIBS);
if (!v)
return NULL;
v->count = count;
v->desc_list_byte_size = align(count * 16, SI_CPDMA_ALIGNMENT);
for (i = 0; i < count; ++i) {
@@ -4370,28 +4377,31 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
unsigned data_format, num_format;
int first_non_void;
unsigned vbo_index = elements[i].vertex_buffer_index;
unsigned char swizzle[4];
if (vbo_index >= SI_NUM_VERTEX_BUFFERS) {
FREE(v);
return NULL;
}
- if (elements[i].instance_divisor) {
+ unsigned instance_divisor = elements[i].instance_divisor;
+ if (instance_divisor) {
v->uses_instance_divisors = true;
- v->instance_divisors[i] = elements[i].instance_divisor;
- if (v->instance_divisors[i] == 1)
+ if (instance_divisor == 1) {
v->instance_divisor_is_one |= 1u << i;
- else
+ } else {
v->instance_divisor_is_fetched |= 1u << i;
+ divisor_factors[i] =
+ util_compute_fast_udiv_info(instance_divisor, 32);
+ }
}
if (!used[vbo_index]) {
v->first_vb_use_mask |= 1 << i;
used[vbo_index] = true;
}
desc = util_format_description(elements[i].src_format);
first_non_void = util_format_get_first_non_void_channel(elements[i].src_format);
data_format = si_translate_buffer_dataformat(ctx->screen, desc, first_non_void);
@@ -4487,20 +4497,36 @@ static void *si_create_vertex_elements(struct pipe_context *ctx,
}
}
v->rsrc_word3[i] = S_008F0C_DST_SEL_X(si_map_swizzle(swizzle[0])) |
S_008F0C_DST_SEL_Y(si_map_swizzle(swizzle[1])) |
S_008F0C_DST_SEL_Z(si_map_swizzle(swizzle[2])) |
S_008F0C_DST_SEL_W(si_map_swizzle(swizzle[3])) |
S_008F0C_NUM_FORMAT(num_format) |
S_008F0C_DATA_FORMAT(data_format);
}
+
+ if (v->instance_divisor_is_fetched) {
+ unsigned num_divisors = util_last_bit(v->instance_divisor_is_fetched);
+
+ v->instance_divisor_factor_buffer =
+ (struct r600_resource*)
+ pipe_buffer_create(&sscreen->b, 0, PIPE_USAGE_DEFAULT,
+ num_divisors * sizeof(divisor_factors[0]));
+ if (!v->instance_divisor_factor_buffer) {
+ FREE(v);
+ return NULL;
+ }
+ void *map = sscreen->ws->buffer_map(v->instance_divisor_factor_buffer->buf,
+ NULL, PIPE_TRANSFER_WRITE);
+ memcpy(map , divisor_factors, num_divisors * sizeof(divisor_factors[0]));
+ }
return v;
}
static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
struct si_vertex_elements *old = sctx->vertex_elements;
struct si_vertex_elements *v = (struct si_vertex_elements*)state;
sctx->vertex_elements = v;
@@ -4510,34 +4536,36 @@ static void si_bind_vertex_elements(struct pipe_context *ctx, void *state)
(!old ||
old->count != v->count ||
old->uses_instance_divisors != v->uses_instance_divisors ||
v->uses_instance_divisors || /* we don't check which divisors changed */
memcmp(old->fix_fetch, v->fix_fetch, sizeof(v->fix_fetch[0]) * v->count)))
sctx->do_update_shaders = true;
if (v && v->instance_divisor_is_fetched) {
struct pipe_constant_buffer cb;
- cb.buffer = NULL;
- cb.user_buffer = v->instance_divisors;
+ cb.buffer = &v->instance_divisor_factor_buffer->b.b;
+ cb.user_buffer = NULL;
cb.buffer_offset = 0;
- cb.buffer_size = sizeof(uint32_t) * v->count;
+ cb.buffer_size = 0xffffffff;
si_set_rw_buffer(sctx, SI_VS_CONST_INSTANCE_DIVISORS, &cb);
}
}
static void si_delete_vertex_element(struct pipe_context *ctx, void *state)
{
struct si_context *sctx = (struct si_context *)ctx;
+ struct si_vertex_elements *v = (struct si_vertex_elements*)state;
if (sctx->vertex_elements == state)
sctx->vertex_elements = NULL;
+ r600_resource_reference(&v->instance_divisor_factor_buffer, NULL);
FREE(state);
}
static void si_set_vertex_buffers(struct pipe_context *ctx,
unsigned start_slot, unsigned count,
const struct pipe_vertex_buffer *buffers)
{
struct si_context *sctx = (struct si_context *)ctx;
struct pipe_vertex_buffer *dst = sctx->vertex_buffer + start_slot;
int i;
diff --git a/src/gallium/drivers/radeonsi/si_state.h b/src/gallium/drivers/radeonsi/si_state.h
index 89bb5b6..d9c3e70 100644
--- a/src/gallium/drivers/radeonsi/si_state.h
+++ b/src/gallium/drivers/radeonsi/si_state.h
@@ -131,21 +131,21 @@ struct si_state_dsa {
};
struct si_stencil_ref {
struct pipe_stencil_ref state;
struct si_dsa_stencil_ref_part dsa_part;
};
struct si_vertex_elements
{
- uint32_t instance_divisors[SI_MAX_ATTRIBS];
+ struct r600_resource *instance_divisor_factor_buffer;
uint32_t rsrc_word3[SI_MAX_ATTRIBS];
uint16_t src_offset[SI_MAX_ATTRIBS];
uint8_t fix_fetch[SI_MAX_ATTRIBS];
uint8_t format_size[SI_MAX_ATTRIBS];
uint8_t vertex_buffer_index[SI_MAX_ATTRIBS];
uint8_t count;
bool uses_instance_divisors;
uint16_t first_vb_use_mask;
--
2.7.4
More information about the mesa-dev
mailing list