[Mesa-dev] [PATCH 5/9] ir3/nir: Add a new pass 'ir3_nir_lower_io_offsets'

Eduardo Lima Mitev elima at igalia.com
Wed Feb 13 21:29:52 UTC 2019


This pass moves into NIR some offset computations that are currently
implemented in the IR3 backend compiler, so that NIR gets a chance to
optimize them.

For now, it only supports lowering the byte-offset computation for
image stores and atomics.
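
For illustration, below is a standalone C sketch of the arithmetic the
pass emits in NIR. The image format, size and stride are made-up
example values; the real pass takes the bytes-per-pixel from the GL
format and the strides from the image_dims[] driver params.

#include <stdio.h>
#include <stdint.h>

int main(void)
{
	/* Hypothetical 64x64 GL_RGBA8 image, tightly packed. */
	uint32_t bpp = 4;              /* bytes per pixel */
	uint32_t y_stride = 64 * bpp;  /* bytes per row */

	uint32_t x = 10, y = 3;        /* texel coordinate */

	/* Byte offset used by image store: x*bpp + y*y_stride */
	uint32_t byte_offset = x * bpp + y * y_stride;

	/* Atomics take a dword offset, hence the extra shift right by 2. */
	uint32_t dword_offset = byte_offset >> 2;

	printf("byte offset: %u, dword offset: %u\n",
	       (unsigned)byte_offset, (unsigned)dword_offset);
	return 0;
}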
---
 src/freedreno/Makefile.sources               |   1 +
 src/freedreno/ir3/ir3_nir.h                  |   1 +
 src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 334 +++++++++++++++++++
 3 files changed, 336 insertions(+)
 create mode 100644 src/freedreno/ir3/ir3_nir_lower_io_offsets.c

diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 7fea9de39ef..235fec1c4f2 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -31,6 +31,7 @@ ir3_SOURCES := \
 	ir3/ir3_legalize.c \
 	ir3/ir3_nir.c \
 	ir3/ir3_nir.h \
+	ir3/ir3_nir_lower_io_offsets.c \
 	ir3/ir3_nir_lower_tg4_to_tex.c \
 	ir3/ir3_print.c \
 	ir3/ir3_ra.c \
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 74201d34160..7983b74af2c 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
 
 bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
 bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+bool ir3_nir_lower_io_offsets(nir_shader *shader);
 
 const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
 bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
new file mode 100644
index 00000000000..a43b3895fd8
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
@@ -0,0 +1,334 @@
+/*
+ * Copyright © 2018-2019 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * This pass moves into NIR certain offset computations for different I/O
+ * ops that are currently implemented in the IR3 backend compiler, to
+ * give NIR a chance to optimize them:
+ *
+ * - Byte-offset for image stores and atomics: emit instructions to
+ *   compute x*bpp + y*y_stride + z*z_stride, and place the resulting
+ *   SSA value in the 4th component of the vec4 instruction that defines
+ *   the offset.
+ */
+
+
+static bool
+intrinsic_is_image_atomic(unsigned intrinsic)
+{
+	switch (intrinsic) {
+	case nir_intrinsic_image_deref_atomic_add:
+	case nir_intrinsic_image_deref_atomic_min:
+	case nir_intrinsic_image_deref_atomic_max:
+	case nir_intrinsic_image_deref_atomic_and:
+	case nir_intrinsic_image_deref_atomic_or:
+	case nir_intrinsic_image_deref_atomic_xor:
+	case nir_intrinsic_image_deref_atomic_exchange:
+	case nir_intrinsic_image_deref_atomic_comp_swap:
+		return true;
+	default:
+		break;
+	}
+
+	return false;
+}
+
+static bool
+intrinsic_is_image_store_or_atomic(unsigned intrinsic)
+{
+	if (intrinsic == nir_intrinsic_image_deref_store)
+		return true;
+	else
+		return intrinsic_is_image_atomic(intrinsic);
+}
+
+/*
+ * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
+ * out at some point.
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+	const struct glsl_type *type = glsl_without_array(var->type);
+	unsigned coords;
+
+	switch (glsl_get_sampler_dim(type)) {
+	case GLSL_SAMPLER_DIM_1D:
+	case GLSL_SAMPLER_DIM_BUF:
+		coords = 1;
+		break;
+	case GLSL_SAMPLER_DIM_2D:
+	case GLSL_SAMPLER_DIM_RECT:
+	case GLSL_SAMPLER_DIM_EXTERNAL:
+	case GLSL_SAMPLER_DIM_MS:
+		coords = 2;
+		break;
+	case GLSL_SAMPLER_DIM_3D:
+	case GLSL_SAMPLER_DIM_CUBE:
+		coords = 3;
+		break;
+	default:
+		unreachable("bad sampler dim");
+		return 0;
+	}
+
+	if (glsl_sampler_type_is_array(type)) {
+		/* adjust # of coords to include array idx: */
+		coords++;
+	}
+
+	return coords;
+}
+
+/* Returns the bytes per pixel for the GL format corresponding to each
+ * supported image format.
+ */
+static unsigned
+bytes_per_pixel_for_gl_format(GLuint format)
+{
+	switch (format) {
+	case GL_R8I:
+	case GL_R8UI:
+	case GL_R8:
+	case GL_R8_SNORM:
+		return 1;
+
+	case GL_R16F:
+	case GL_R16I:
+	case GL_R16UI:
+	case GL_R16:
+	case GL_R16_SNORM:
+	case GL_RG8I:
+	case GL_RG8UI:
+	case GL_RG8:
+	case GL_RG8_SNORM:
+		return 2;
+
+	case GL_R32F:
+	case GL_R32I:
+	case GL_R32UI:
+	case GL_RG16F:
+	case GL_RG16I:
+	case GL_RG16UI:
+	case GL_RG16:
+	case GL_RG16_SNORM:
+	case GL_RGBA8I:
+	case GL_RGBA8UI:
+	case GL_RGBA8:
+	case GL_RGBA8_SNORM:
+	case GL_RGB10_A2UI:
+	case GL_RGB10_A2:
+	case GL_R11F_G11F_B10F:
+		return 4;
+
+	case GL_RG32F:
+	case GL_RG32I:
+	case GL_RG32UI:
+	case GL_RGBA16F:
+	case GL_RGBA16I:
+	case GL_RGBA16UI:
+	case GL_RGBA16:
+	case GL_RGBA16_SNORM:
+		return 8;
+
+	case GL_RGBA32F:
+	case GL_RGBA32I:
+	case GL_RGBA32UI:
+		return 16;
+
+	default:
+		debug_assert(!"Unhandled GL format");
+	}
+
+	return 0;
+}
+
+static nir_ssa_def *
+insert_load_image_param(nir_builder *b, nir_ssa_def *image_deref,
+						unsigned dimension)
+{
+	nir_intrinsic_instr *load =
+		nir_intrinsic_instr_create(b->shader,
+								   nir_intrinsic_image_deref_load_param_ir3);
+	load->num_components = 1;
+	load->const_index[0] = dimension;
+	load->src[0] = nir_src_for_ssa(image_deref);
+	nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+
+	nir_builder_instr_insert(b, &load->instr);
+
+	return &load->dest.ssa;
+}
+
+static bool
+lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
+									   const nir_variable *var, nir_builder *b,
+									   void *mem_ctx)
+{
+	nir_ssa_def *image_deref;
+
+	/* Find the instruction that defines the coord source of the image
+	 * store/atomic intrinsic. It must be a "vec4" ALU instruction.
+	 */
+	debug_assert(intrinsic->src[1].is_ssa);
+	nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;
+
+	nir_instr *offset_parent_instr = offset_src_def->parent_instr;
+	debug_assert(offset_parent_instr->type == nir_instr_type_alu);
+
+	nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
+	debug_assert(vec4_instr->op == nir_op_vec4);
+
+	unsigned coords = get_image_coords(var);
+
+	b->cursor = nir_before_instr(&vec4_instr->instr);
+
+	/* These are actually offsets into the image_dims register file (for
+	 * a given image).
+	 */
+	enum {
+		BYTES_PER_PIXEL = 0,
+		Y_STRIDE        = 1,
+		Z_STRIDE        = 2
+	};
+
+	/* x_offset = coord.x * bytes_per_pixel */
+	nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
+	unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
+	nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
+	nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
+	debug_assert(imul);
+	imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
+	debug_assert(offset);
+
+	/* For the Y and Z dimensions, we emit a temporary image_deref_load_param
+	 * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(), which
+	 * will just emit a uniform with the right value from image_dims[].
+	 */
+
+	if (coords > 1) {
+		/* src0 of the intrinsic is the dereferenced image. */
+		debug_assert(intrinsic->src[0].is_ssa);
+		image_deref = intrinsic->src[0].ssa;
+
+		nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
+		nir_ssa_def *y_stride =
+			insert_load_image_param(b, image_deref, Y_STRIDE);
+
+		/* y_offset = coord.y * y_stride + x_offset */
+		offset = nir_imad24_ir3(b, y_stride, y_coord, offset);
+		debug_assert(offset);
+		nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+		debug_assert(imad);
+		imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];
+
+		if (coords > 2) {
+			nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
+			nir_ssa_def *z_stride =
+				insert_load_image_param(b, image_deref, Z_STRIDE);
+
+			/* z_offset = coord.z * z_stride + y_offset */
+			offset = nir_imad24_ir3(b, z_stride, z_coord, offset);
+			debug_assert(offset);
+			nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+			debug_assert(imad);
+			imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
+		}
+	}
+
+	if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
+		/* Some cases, like atomics, seem to use a dword offset instead
+		 * of a byte offset; the blob just emits an extra shr.b in
+		 * those cases:
+		 */
+		nir_ssa_def *two = nir_imm_int(b, 2);
+		offset = nir_ushr(b, offset, two);
+	}
+
+	/* Finally, store the computed offset in the 4th component of the
+	 * vec4 instruction, which is the only one guaranteed not to be used.
+	 * We don't want to overwrite the original coordinate because it is
+	 * still needed in the backend.
+	 */
+	nir_instr_rewrite_src(&vec4_instr->instr,
+						  &vec4_instr->src[3].src,
+						  nir_src_for_ssa(offset));
+	return true;
+}
+
+static bool
+lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+	bool progress = false;
+
+	nir_foreach_instr_safe(instr, block) {
+		if (instr->type != nir_instr_type_intrinsic)
+			continue;
+
+		nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+		if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
+			continue;
+
+		const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+		progress |= lower_offset_for_image_store_or_atomic(intr, var, b,
+														   mem_ctx);
+	}
+
+	return progress;
+}
+
+static bool
+lower_io_offsets_func(nir_function_impl *impl)
+{
+	void *mem_ctx = ralloc_parent(impl);
+	nir_builder b;
+	nir_builder_init(&b, impl);
+
+	bool progress = false;
+	nir_foreach_block_safe(block, impl) {
+		progress |= lower_io_offsets_block(block, &b, mem_ctx);
+	}
+
+	if (progress) {
+		nir_metadata_preserve(impl, nir_metadata_block_index |
+									nir_metadata_dominance);
+	}
+
+	return progress;
+}
+
+bool
+ir3_nir_lower_io_offsets(nir_shader *shader)
+{
+	bool progress = false;
+
+	nir_foreach_function(function, shader) {
+		if (function->impl)
+			progress |= lower_io_offsets_func(function->impl);
+	}
+
+	return progress;
+}
-- 
2.20.1


