[Freedreno] [RFC 3/4] ir3/nir: Add a new pass 'ir3_nir_lower_sampler_io'
Eduardo Lima Mitev
elima at igalia.com
Fri Jan 25 15:48:32 UTC 2019
This pass moves to NIR some offset calculations that are currently
implemented in the backend compiler, so that NIR gets a chance to
optimize them.

For now, only coordinate byte-offset calculations for imageStore and
image atomic operations are implemented.
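
For reference, the byte offset computed by the pass (and stored in the
4th component of the coordinate vec4) is equivalent to the C sketch
below. It is illustrative only and not part of the patch: 'bpp',
'y_stride' and 'z_stride' stand in for the format-derived
bytes-per-pixel and the image_dims[] strides, and 'coords' is the
coordinate count for the image dimensionality (including the array
index):

    /* illustrative sketch, not part of the patch */
    static unsigned
    image_byte_offset(unsigned x, unsigned y, unsigned z, unsigned coords,
                      unsigned bpp, unsigned y_stride, unsigned z_stride)
    {
       unsigned offset = x * bpp;     /* x_offset = coords.x * bytes_per_pixel */
       if (coords > 1)
          offset += y * y_stride;     /* + coords.y * y_stride */
       if (coords > 2)
          offset += z * z_stride;     /* + coords.z * z_stride */
       return offset;                 /* atomics additionally do offset >>= 2 */
    }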
---
src/freedreno/Makefile.sources | 1 +
src/freedreno/ir3/ir3_nir.h | 1 +
src/freedreno/ir3/ir3_nir_lower_sampler_io.c | 349 +++++++++++++++++++
3 files changed, 351 insertions(+)
create mode 100644 src/freedreno/ir3/ir3_nir_lower_sampler_io.c
diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 7fea9de39ef..fd4f7f294cd 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -31,6 +31,7 @@ ir3_SOURCES := \
ir3/ir3_legalize.c \
ir3/ir3_nir.c \
ir3/ir3_nir.h \
+ ir3/ir3_nir_lower_sampler_io.c \
ir3/ir3_nir_lower_tg4_to_tex.c \
ir3/ir3_print.c \
ir3/ir3_ra.c \
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 74201d34160..52809ba099e 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+bool ir3_nir_lower_sampler_io(nir_shader *shader);
const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
diff --git a/src/freedreno/ir3/ir3_nir_lower_sampler_io.c b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
new file mode 100644
index 00000000000..e2910d8906d
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright © 2018 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The goal of this pass is to move to NIR some offset calculations for
+ * various I/O operations that are currently implemented in the backend
+ * compiler, so that NIR gets a chance to optimize them.
+ *
+ * Currently, only offset calculations for image store and image
+ * atomic operations are implemented.
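+ *
+ * As a rough sketch (illustrative, not actual NIR syntax): for a 2D
+ * image store, the coordinate source
+ *
+ *    vec4(x, y, _, _)
+ *
+ * is rewritten so that its unused 4th component carries the computed
+ * byte offset:
+ *
+ *    vec4(x, y, _, x * bpp + y * y_stride)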
+ */
+
+
+/* This flag enables/disables a code path where the bytes-per-pixel of
+ * an image is obtained directly from its format, which is known at
+ * shader compile time; as opposed to using the image_dims[0] constant,
+ * which is only available at shader runtime.
+ *
+ * Inlining the bytes-per-pixel here as an immediate has the advantage
+ * that it gets converted to a single shift (SHL) instruction, because
+ * all possible values are powers of two; whereas loading it as a
+ * uniform and emitting an IMUL causes the backend to expand it into
+ * quite a few instructions (see how imul is handled in
+ * ir3_compiler_nir), ultimately hurting instruction count.
+ */
+#define INLINE_BPP 1
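+
+/* Rough illustration of the two paths, assuming e.g. GL_RGBA32F
+ * (bpp = 16; the numbers are only an example):
+ *
+ *   INLINE_BPP=1:  offset = x * 16              -> folded into: x << 4
+ *   INLINE_BPP=0:  offset = x * image_dims[0]   -> imul of a uniform,
+ *                  expanded by the backend into several instructions
+ */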
+
+
+static bool
+intrinsic_is_image_atomic(unsigned intrinsic)
+{
+ switch (intrinsic) {
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+intrinsic_is_image_store_or_atomic(unsigned intrinsic)
+{
+ if (intrinsic == nir_intrinsic_image_deref_store)
+ return true;
+ else
+ return intrinsic_is_image_atomic(intrinsic);
+}
+
+/*
+ * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
+ * out at some point.
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+ const struct glsl_type *type = glsl_without_array(var->type);
+ unsigned coords;
+
+ switch (glsl_get_sampler_dim(type)) {
+ case GLSL_SAMPLER_DIM_1D:
+ case GLSL_SAMPLER_DIM_BUF:
+ coords = 1;
+ break;
+ case GLSL_SAMPLER_DIM_2D:
+ case GLSL_SAMPLER_DIM_RECT:
+ case GLSL_SAMPLER_DIM_EXTERNAL:
+ case GLSL_SAMPLER_DIM_MS:
+ coords = 2;
+ break;
+ case GLSL_SAMPLER_DIM_3D:
+ case GLSL_SAMPLER_DIM_CUBE:
+ coords = 3;
+ break;
+ default:
+ unreachable("bad sampler dim");
+ return 0;
+ }
+
+ if (glsl_sampler_type_is_array(type)) {
+ /* note: unlike tex_info(), adjust # of coords to include array idx: */
+ coords++;
+ }
+
+ return coords;
+}
+
+#if INLINE_BPP
+/* Returns the bytes-per-pixel for each of the GL formats corresponding to
+ * the supported image formats.
+ */
+static unsigned
+bytes_per_pixel_for_gl_format(GLuint format)
+{
+ switch (format) {
+ case GL_R8I:
+ case GL_R8UI:
+ case GL_R8:
+ case GL_R8_SNORM:
+ return 1;
+
+ case GL_R16F:
+ case GL_R16I:
+ case GL_R16UI:
+ case GL_R16:
+ case GL_R16_SNORM:
+ case GL_RG8I:
+ case GL_RG8UI:
+ case GL_RG8:
+ case GL_RG8_SNORM:
+ return 2;
+
+ case GL_R32F:
+ case GL_R32I:
+ case GL_R32UI:
+ case GL_RG16F:
+ case GL_RG16I:
+ case GL_RG16UI:
+ case GL_RG16:
+ case GL_RG16_SNORM:
+ case GL_RGBA8I:
+ case GL_RGBA8UI:
+ case GL_RGBA8:
+ case GL_RGBA8_SNORM:
+ case GL_RGB10_A2UI:
+ case GL_RGB10_A2:
+ case GL_R11F_G11F_B10F:
+ return 4;
+
+ case GL_RG32F:
+ case GL_RG32I:
+ case GL_RG32UI:
+ case GL_RGBA16F:
+ case GL_RGBA16I:
+ case GL_RGBA16UI:
+ case GL_RGBA16:
+ case GL_RGBA16_SNORM:
+ return 8;
+
+ case GL_RGBA32F:
+ case GL_RGBA32I:
+ case GL_RGBA32UI:
+ return 16;
+
+ default:
+ debug_assert(!"Unhandled GL format");
+ }
+
+ return 0;
+}
+#endif /* INLINE_BPP */
+
+static nir_ssa_def *
+insert_load_image_stride(nir_builder *b, unsigned image_index,
+ unsigned dimension)
+{
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_image_stride);
+ load->num_components = 1;
+ load->const_index[0] = image_index;
+ load->const_index[1] = dimension;
+ load->src[0] = nir_src_for_ssa(zero);
+ nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+
+ nir_builder_instr_insert(b, &load->instr);
+
+ return &load->dest.ssa;
+}
+
+static void
+lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
+ const nir_variable *var, nir_builder *b,
+ void *mem_ctx)
+{
+ /* Find the instruction that defines the coord source of the image
+ * store/atomic intrinsic. It must be a "vec4" ALU instruction.
+ */
+ debug_assert(intrinsic->src[1].is_ssa);
+ nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;
+
+ nir_instr *offset_parent_instr = offset_src_def->parent_instr;
+ debug_assert(offset_parent_instr->type == nir_instr_type_alu);
+
+ nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
+ debug_assert(vec4_instr->op == nir_op_vec4);
+
+ unsigned coords = get_image_coords(var);
+
+ b->cursor = nir_before_instr(&vec4_instr->instr);
+
+ /* These are actually offsets into the image_dims register file (for
+ * a given image).
+ */
+ enum {
+ BYTES_PER_PIXEL = 0,
+ Y_STRIDE = 1,
+ Z_STRIDE = 2
+ };
+
+ /* x_offset = coords.x * bytes_per_pixel */
+ nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
+#if INLINE_BPP
+ unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
+ nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
+#else
+ nir_ssa_def *bpp =
+ insert_load_image_stride(b, var->data.driver_location, BYTES_PER_PIXEL);
+ nir_ssa_def *offset = nir_imul(b, x_coord, bpp);
+#endif
+ nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
+ debug_assert(imul);
+ imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
+ debug_assert(offset);
+
+ /* For the Y and Z dimensions, we emit a temporary load_image_stride
+ * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(), which
+ * will just emit a uniform with the right value from image_dims[].
+ */
+
+ if (coords > 1) {
+ nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
+ nir_ssa_def *y_stride =
+ insert_load_image_stride(b, var->data.driver_location, Y_STRIDE);
+
+ /* y_offset = coords.y * y_stride + x_offset */
+ offset = nir_imad(b, y_stride, y_coord, offset);
+ debug_assert(offset);
+ nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+ debug_assert(imad);
+ imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];
+ }
+
+ if (coords > 2) {
+ nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
+ nir_ssa_def *z_stride =
+ insert_load_image_stride(b, var->data.driver_location, Z_STRIDE);
+
+ /* z_offset = coords.z * z_stride + y_offset */
+ offset = nir_imad(b, z_stride, z_coord, offset);
+ debug_assert(offset);
+ nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+ debug_assert(imad);
+ imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
+ }
+
+ if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
+ /* Some cases, like atomics, seem to use a dword offset instead
+ * of a byte offset. The blob driver just puts an extra shr.b in
+ * there in those cases:
+ */
+ nir_ssa_def *two = nir_imm_int(b, 2);
+ offset = nir_ushr(b, offset, two);
+ }
+
+ /* Finally, store the calculated offset in the 4th component of the
+ * vec4 instruction. We use the 4th component because it is the one
+ * we know for sure is not used.
+ */
+ nir_instr_rewrite_src(&vec4_instr->instr,
+ &vec4_instr->src[3].src,
+ nir_src_for_ssa(offset));
+}
+
+static bool
+lower_sampler_io_block(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+ bool progress = false;
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
+ continue;
+
+ const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+ lower_offset_for_image_store_or_atomic(intr, var, b, mem_ctx);
+
+ progress = true;
+ }
+
+ return progress;
+}
+
+static bool
+lower_sampler_io_func(nir_function_impl *impl)
+{
+ void *mem_ctx = ralloc_parent(impl);
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ bool progress = false;
+ nir_foreach_block_safe(block, impl) {
+ progress |= lower_sampler_io_block(block, &b, mem_ctx);
+ }
+
+ if (progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
+ return progress;
+}
+
+bool
+ir3_nir_lower_sampler_io(nir_shader *shader)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl)
+ progress |= lower_sampler_io_func(function->impl);
+ }
+
+ return progress;
+}
--
2.20.1