Mesa (master): ac/nir: implement sparse image/texture loads
GitLab Mirror
gitlab-mirror at kemper.freedesktop.org
Fri Jan 8 14:43:26 UTC 2021
Module: Mesa
Branch: master
Commit: 6d5e26752c664c7095ed0e7693591be797066110
URL: http://cgit.freedesktop.org/mesa/mesa/commit/?id=6d5e26752c664c7095ed0e7693591be797066110
Author: Rhys Perry <pendingchaos02 at gmail.com>
Date: Mon Nov 23 15:02:28 2020 +0000
ac/nir: implement sparse image/texture loads
Signed-off-by: Rhys Perry <pendingchaos02 at gmail.com>
Reviewed-by: Daniel Schürmann <daniel at schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/7775>
---
src/amd/llvm/ac_llvm_build.c | 96 +++++++++++++++++++---
src/amd/llvm/ac_llvm_build.h | 8 +-
src/amd/llvm/ac_nir_to_llvm.c | 40 +++++++--
.../drivers/radeonsi/si_compute_prim_discard.c | 2 +-
src/gallium/drivers/radeonsi/si_shader_llvm_vs.c | 2 +-
5 files changed, 128 insertions(+), 20 deletions(-)
diff --git a/src/amd/llvm/ac_llvm_build.c b/src/amd/llvm/ac_llvm_build.c
index 93d408bcfb1..4d8c5da4d5e 100644
--- a/src/amd/llvm/ac_llvm_build.c
+++ b/src/amd/llvm/ac_llvm_build.c
@@ -325,8 +325,27 @@ void ac_build_type_name_for_intr(LLVMTypeRef type, char *buf, unsigned bufsize)
{
LLVMTypeRef elem_type = type;
- assert(bufsize >= 8);
+ if (LLVMGetTypeKind(type) == LLVMStructTypeKind) {
+ unsigned count = LLVMCountStructElementTypes(type);
+ int ret = snprintf(buf, bufsize, "sl_");
+ buf += ret;
+ bufsize -= ret;
+
+ LLVMTypeRef *elems = alloca(count * sizeof(LLVMTypeRef));
+ LLVMGetStructElementTypes(type, elems);
+ for (unsigned i = 0; i < count; i++) {
+ ac_build_type_name_for_intr(elems[i], buf, bufsize);
+ ret = strlen(buf);
+ buf += ret;
+ bufsize -= ret;
+ }
+
+ snprintf(buf, bufsize, "s");
+ return;
+ }
+
+ assert(bufsize >= 8);
if (LLVMGetTypeKind(type) == LLVMVectorTypeKind) {
int ret = snprintf(buf, bufsize, "v%u", LLVMGetVectorSize(type));
if (ret < 0) {
@@ -566,11 +585,25 @@ LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *v
return ac_build_gather_values_extended(ctx, values, value_count, 1, false, false);
}
+LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b)
+{
+ unsigned a_size = ac_get_llvm_num_components(a);
+ unsigned b_size = ac_get_llvm_num_components(b);
+
+ LLVMValueRef *elems = alloca((a_size + b_size) * sizeof(LLVMValueRef));
+ for (unsigned i = 0; i < a_size; i++)
+ elems[i] = ac_llvm_extract_elem(ctx, a, i);
+ for (unsigned i = 0; i < b_size; i++)
+ elems[a_size + i] = ac_llvm_extract_elem(ctx, b, i);
+
+ return ac_build_gather_values(ctx, elems, a_size + b_size);
+}
+
/* Expand a scalar or vector to <dst_channels x type> by filling the remaining
* channels with undef. Extract at most src_channels components from the input.
*/
-static LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
- unsigned src_channels, unsigned dst_channels)
+LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
+ unsigned src_channels, unsigned dst_channels)
{
LLVMTypeRef elemtype;
LLVMValueRef *const chan = alloca(dst_channels * sizeof(LLVMValueRef));
@@ -1231,8 +1264,42 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
unsigned num_channels, unsigned cache_policy,
- bool can_speculate, bool d16)
+ bool can_speculate, bool d16, bool tfe)
{
+ if (tfe) {
+ assert(!d16);
+
+ char code[256];
+ /* The definition in the assembly and the one in the constraint string
+ * differs because of an assembler bug.
+ */
+ snprintf(code, sizeof(code),
+ "v_mov_b32 v0, 0\n"
+ "v_mov_b32 v1, 0\n"
+ "v_mov_b32 v2, 0\n"
+ "v_mov_b32 v3, 0\n"
+ "v_mov_b32 v4, 0\n"
+ "buffer_load_format_xyzw v[0:3], $1, $2, 0, idxen offen %s %s tfe %s\n"
+ "s_waitcnt vmcnt(0)",
+ cache_policy & ac_glc ? "glc" : "",
+ cache_policy & ac_slc ? "slc" : "",
+ cache_policy & ac_dlc ? "dlc" : "");
+
+ LLVMTypeRef param_types[] = {ctx->v2i32, ctx->v4i32};
+ LLVMTypeRef calltype = LLVMFunctionType(LLVMVectorType(ctx->f32, 5), param_types, 2, false);
+ LLVMValueRef inlineasm = LLVMConstInlineAsm(calltype, code, "=&{v[0:4]},v,s", false, false);
+
+ LLVMValueRef addr_comp[2] = {vindex ? vindex : ctx->i32_0,
+ voffset ? voffset : ctx->i32_0};
+
+ LLVMValueRef args[] = {ac_build_gather_values(ctx, addr_comp, 2),
+ LLVMBuildBitCast(ctx->builder, rsrc, ctx->v4i32, "")};
+ LLVMValueRef res = LLVMBuildCall(ctx->builder, inlineasm, args, 2, "");
+
+ return ac_build_concat(ctx, ac_trim_vector(ctx, res, num_channels),
+ ac_llvm_extract_elem(ctx, res, 4));
+ }
+
return ac_build_buffer_load_common(ctx, rsrc, vindex, voffset, ctx->i32_0, num_channels,
d16 ? ctx->f16 : ctx->f32, cache_policy, can_speculate, true,
true);
@@ -2120,7 +2187,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
LLVMTypeRef coord_type = sample ? ctx->f32 : ctx->i32;
uint8_t dmask = a->dmask;
LLVMTypeRef data_type;
- char data_type_str[8];
+ char data_type_str[32];
if (atomic) {
data_type = LLVMTypeOf(a->data[0]);
@@ -2132,6 +2199,11 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
data_type = a->d16 ? ctx->v4f16 : ctx->v4f32;
}
+ if (a->tfe) {
+ data_type = LLVMStructTypeInContext(
+ ctx->context, (LLVMTypeRef[]){data_type, ctx->i32}, 2, false);
+ }
+
if (atomic || a->opcode == ac_image_store || a->opcode == ac_image_store_mip) {
args[num_args++] = a->data[0];
if (a->opcode == ac_image_atomic_cmpswap)
@@ -2171,7 +2243,7 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
args[num_args++] = LLVMConstInt(ctx->i1, a->unorm, false);
}
- args[num_args++] = ctx->i32_0; /* texfailctrl */
+ args[num_args++] = a->tfe ? ctx->i32_1 : ctx->i32_0; /* texfailctrl */
args[num_args++] = LLVMConstInt(
ctx->i32, load ? get_load_cache_policy(ctx, a->cache_policy) : a->cache_policy, false);
@@ -2258,14 +2330,18 @@ LLVMValueRef ac_build_image_opcode(struct ac_llvm_context *ctx, struct ac_image_
data_type_str, overload[0], overload[1], overload[2]);
LLVMTypeRef retty;
- if (atomic)
- retty = data_type;
- else if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
+ if (a->opcode == ac_image_store || a->opcode == ac_image_store_mip)
retty = ctx->voidt;
else
- retty = a->d16 ? ctx->v4f16 : ctx->v4f32;
+ retty = data_type;
LLVMValueRef result = ac_build_intrinsic(ctx, intr_name, retty, args, num_args, a->attributes);
+ if (a->tfe) {
+ LLVMValueRef texel = LLVMBuildExtractValue(ctx->builder, result, 0, "");
+ LLVMValueRef code = LLVMBuildExtractValue(ctx->builder, result, 1, "");
+ result = ac_build_concat(ctx, texel, ac_to_float(ctx, code));
+ }
+
if (!sample && !atomic && retty != ctx->voidt)
result = ac_to_integer(ctx, result);
diff --git a/src/amd/llvm/ac_llvm_build.h b/src/amd/llvm/ac_llvm_build.h
index 8c84b38b73b..5a4a61a0e71 100644
--- a/src/amd/llvm/ac_llvm_build.h
+++ b/src/amd/llvm/ac_llvm_build.h
@@ -195,9 +195,14 @@ LLVMValueRef ac_build_gather_values_extended(struct ac_llvm_context *ctx, LLVMVa
LLVMValueRef ac_build_gather_values(struct ac_llvm_context *ctx, LLVMValueRef *values,
unsigned value_count);
+LLVMValueRef ac_build_concat(struct ac_llvm_context *ctx, LLVMValueRef a, LLVMValueRef b);
+
LLVMValueRef ac_extract_components(struct ac_llvm_context *ctx, LLVMValueRef value, unsigned start,
unsigned channels);
+LLVMValueRef ac_build_expand(struct ac_llvm_context *ctx, LLVMValueRef value,
+ unsigned src_channels, unsigned dst_channels);
+
LLVMValueRef ac_build_expand_to_vec4(struct ac_llvm_context *ctx, LLVMValueRef value,
unsigned num_channels);
LLVMValueRef ac_build_round(struct ac_llvm_context *ctx, LLVMValueRef value);
@@ -261,7 +266,7 @@ LLVMValueRef ac_build_buffer_load(struct ac_llvm_context *ctx, LLVMValueRef rsrc
LLVMValueRef ac_build_buffer_load_format(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef vindex, LLVMValueRef voffset,
unsigned num_channels, unsigned cache_policy,
- bool can_speculate, bool d16);
+ bool can_speculate, bool d16, bool tfe);
LLVMValueRef ac_build_tbuffer_load_short(struct ac_llvm_context *ctx, LLVMValueRef rsrc,
LLVMValueRef voffset, LLVMValueRef soffset,
@@ -399,6 +404,7 @@ struct ac_image_args {
bool unorm : 1;
bool level_zero : 1;
bool d16 : 1; /* data and return values are 16-bit, requires GFX8+ */
+ bool tfe : 1;
unsigned attributes; /* additional call-site specific AC_FUNC_ATTRs */
LLVMValueRef resource;
diff --git a/src/amd/llvm/ac_nir_to_llvm.c b/src/amd/llvm/ac_nir_to_llvm.c
index 0555fe695ba..b6c768f4b84 100644
--- a/src/amd/llvm/ac_nir_to_llvm.c
+++ b/src/amd/llvm/ac_nir_to_llvm.c
@@ -1424,13 +1424,16 @@ static nir_deref_instr *get_tex_texture_deref(const nir_tex_instr *instr)
static LLVMValueRef build_tex_intrinsic(struct ac_nir_context *ctx, const nir_tex_instr *instr,
struct ac_image_args *args)
{
+ assert((!args->tfe || !args->d16) && "unsupported");
+
if (instr->sampler_dim == GLSL_SAMPLER_DIM_BUF) {
unsigned mask = nir_ssa_def_components_read(&instr->dest.ssa);
assert(instr->dest.is_ssa);
return ac_build_buffer_load_format(&ctx->ac, args->resource, args->coords[0], ctx->ac.i32_0,
util_last_bit(mask), 0, true,
- instr->dest.ssa.bit_size == 16);
+ instr->dest.ssa.bit_size == 16,
+ args->tfe);
}
args->opcode = ac_image_sample;
@@ -2298,7 +2301,9 @@ static void get_image_coords(struct ac_nir_context *ctx, const nir_intrinsic_ins
count = image_type_to_components_count(dim, is_array);
if (is_ms && (instr->intrinsic == nir_intrinsic_image_deref_load ||
- instr->intrinsic == nir_intrinsic_bindless_image_load)) {
+ instr->intrinsic == nir_intrinsic_bindless_image_load ||
+ instr->intrinsic == nir_intrinsic_image_deref_sparse_load ||
+ instr->intrinsic == nir_intrinsic_bindless_image_sparse_load)) {
LLVMValueRef fmask_load_address[3];
fmask_load_address[0] = LLVMBuildExtractElement(ctx->ac.builder, src0, masks[0], "");
@@ -2420,6 +2425,7 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
struct ac_image_args args = {0};
args.cache_policy = get_cache_policy(ctx, access, false, false);
+ args.tfe = instr->intrinsic == nir_intrinsic_image_deref_sparse_load;
if (dim == GLSL_SAMPLER_DIM_BUF) {
unsigned num_channels = util_last_bit(nir_ssa_def_components_read(&instr->dest.ssa));
@@ -2435,8 +2441,9 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
bool can_speculate = access & ACCESS_CAN_REORDER;
res = ac_build_buffer_load_format(&ctx->ac, rsrc, vindex, ctx->ac.i32_0, num_channels,
args.cache_policy, can_speculate,
- instr->dest.ssa.bit_size == 16);
- res = ac_build_expand_to_vec4(&ctx->ac, res, num_channels);
+ instr->dest.ssa.bit_size == 16,
+ args.tfe);
+ res = ac_build_expand(&ctx->ac, res, num_channels, args.tfe ? 5 : 4);
res = ac_trim_vector(&ctx->ac, res, instr->dest.ssa.num_components);
res = ac_to_integer(&ctx->ac, res);
@@ -2459,12 +2466,20 @@ static LLVMValueRef visit_image_load(struct ac_nir_context *ctx, const nir_intri
}
if (instr->dest.ssa.bit_size == 64) {
+ LLVMValueRef code = NULL;
+ if (args.tfe) {
+ code = ac_llvm_extract_elem(&ctx->ac, res, 4);
+ res = ac_trim_vector(&ctx->ac, res, 4);
+ }
+
res = LLVMBuildBitCast(ctx->ac.builder, res, LLVMVectorType(ctx->ac.i64, 2), "");
LLVMValueRef x = LLVMBuildExtractElement(ctx->ac.builder, res, ctx->ac.i32_0, "");
LLVMValueRef w = LLVMBuildExtractElement(ctx->ac.builder, res, ctx->ac.i32_1, "");
- LLVMValueRef values[4] = {x, ctx->ac.i64_0, ctx->ac.i64_0, w};
- res = ac_build_gather_values(&ctx->ac, values, 4);
+ if (code)
+ code = LLVMBuildZExt(ctx->ac.builder, code, ctx->ac.i64, "");
+ LLVMValueRef values[5] = {x, ctx->ac.i64_0, ctx->ac.i64_0, w, code};
+ res = ac_build_gather_values(&ctx->ac, values, 4 + args.tfe);
}
return exit_waterfall(ctx, &wctx, res);
@@ -3583,6 +3598,7 @@ static void visit_intrinsic(struct ac_nir_context *ctx, nir_intrinsic_instr *ins
result = visit_image_load(ctx, instr, true);
break;
case nir_intrinsic_image_deref_load:
+ case nir_intrinsic_image_deref_sparse_load:
result = visit_image_load(ctx, instr, false);
break;
case nir_intrinsic_bindless_image_store:
@@ -4441,9 +4457,16 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
assert(instr->dest.is_ssa);
args.d16 = instr->dest.ssa.bit_size == 16;
+ args.tfe = instr->is_sparse;
result = build_tex_intrinsic(ctx, instr, &args);
+ LLVMValueRef code = NULL;
+ if (instr->is_sparse) {
+ code = ac_llvm_extract_elem(&ctx->ac, result, 4);
+ result = ac_trim_vector(&ctx->ac, result, 4);
+ }
+
if (instr->op == nir_texop_query_levels)
result =
LLVMBuildExtractElement(ctx->ac.builder, result, LLVMConstInt(ctx->ac.i32, 3, false), "");
@@ -4462,9 +4485,12 @@ static void visit_tex(struct ac_nir_context *ctx, nir_tex_instr *instr)
LLVMValueRef two = LLVMConstInt(ctx->ac.i32, 2, false);
LLVMValueRef layers = LLVMBuildExtractElement(ctx->ac.builder, result, two, "");
result = LLVMBuildInsertElement(ctx->ac.builder, result, layers, ctx->ac.i32_1, "");
- } else if (instr->dest.ssa.num_components != 4)
+ } else if (nir_tex_instr_result_size(instr) != 4)
result = ac_trim_vector(&ctx->ac, result, instr->dest.ssa.num_components);
+ if (instr->is_sparse)
+ result = ac_build_concat(&ctx->ac, result, code);
+
write_result:
if (result) {
assert(instr->dest.is_ssa);
diff --git a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
index 10e46687de1..4c94f2c53e3 100644
--- a/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
+++ b/src/gallium/drivers/radeonsi/si_compute_prim_discard.c
@@ -460,7 +460,7 @@ void si_build_prim_discard_compute_shader(struct si_shader_context *ctx)
if (key->opt.cs_indexed) {
for (unsigned i = 0; i < 3; i++) {
index[i] = ac_build_buffer_load_format(&ctx->ac, input_indexbuf, index[i], ctx->ac.i32_0,
- 1, 0, true, false);
+ 1, 0, true, false, false);
index[i] = ac_to_integer(&ctx->ac, index[i]);
}
}
diff --git a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
index c280b585ea9..19c011f2e43 100644
--- a/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
+++ b/src/gallium/drivers/radeonsi/si_shader_llvm_vs.c
@@ -158,7 +158,7 @@ static void load_input_vs(struct si_shader_context *ctx, unsigned input_index, L
for (unsigned i = 0; i < num_fetches; ++i) {
LLVMValueRef voffset = LLVMConstInt(ctx->ac.i32, fetch_stride * i, 0);
fetches[i] = ac_build_buffer_load_format(&ctx->ac, vb_desc, vertex_index, voffset,
- channels_per_fetch, 0, true, false);
+ channels_per_fetch, 0, true, false, false);
}
if (num_fetches == 1 && channels_per_fetch > 1) {
More information about the mesa-commit
mailing list