[Freedreno] [RFC 3/4] ir3/nir: Add a new pass 'ir3_nir_lower_sampler_io'
Eduardo Lima Mitev
elima at igalia.com
Fri Jan 25 15:48:32 UTC 2019
This pass moves to NIR some offset calculations that are currently
implemented in the backend compiler, so that NIR gets a chance to
optimize them.

For now, only coordinate byte-offset calculations for imageStore and
image atomic operations are implemented.
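
For reference, the byte offset computed by the pass (and stored in the
4th component of the coordinate vec4) is equivalent to the C sketch
below. It is illustrative only and not part of the patch: 'bpp',
'y_stride' and 'z_stride' stand in for the format-derived
bytes-per-pixel and the image_dims[] strides, and 'coords' is the
coordinate count for the image dimensionality (including the array
index):

    /* illustrative sketch, not part of the patch */
    static unsigned
    image_byte_offset(unsigned x, unsigned y, unsigned z, unsigned coords,
                      unsigned bpp, unsigned y_stride, unsigned z_stride)
    {
       unsigned offset = x * bpp;     /* x_offset = coords.x * bytes_per_pixel */
       if (coords > 1)
          offset += y * y_stride;     /* + coords.y * y_stride */
       if (coords > 2)
          offset += z * z_stride;     /* + coords.z * z_stride */
       return offset;                 /* atomics additionally do offset >>= 2 */
    }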
---
src/freedreno/Makefile.sources | 1 +
src/freedreno/ir3/ir3_nir.h | 1 +
src/freedreno/ir3/ir3_nir_lower_sampler_io.c | 349 +++++++++++++++++++
3 files changed, 351 insertions(+)
create mode 100644 src/freedreno/ir3/ir3_nir_lower_sampler_io.c
diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
index 7fea9de39ef..fd4f7f294cd 100644
--- a/src/freedreno/Makefile.sources
+++ b/src/freedreno/Makefile.sources
@@ -31,6 +31,7 @@ ir3_SOURCES := \
ir3/ir3_legalize.c \
ir3/ir3_nir.c \
ir3/ir3_nir.h \
+ ir3/ir3_nir_lower_sampler_io.c \
ir3/ir3_nir_lower_tg4_to_tex.c \
ir3/ir3_print.c \
ir3/ir3_ra.c \
diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
index 74201d34160..52809ba099e 100644
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
+bool ir3_nir_lower_sampler_io(nir_shader *shader);
const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
diff --git a/src/freedreno/ir3/ir3_nir_lower_sampler_io.c b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
new file mode 100644
index 00000000000..e2910d8906d
--- /dev/null
+++ b/src/freedreno/ir3/ir3_nir_lower_sampler_io.c
@@ -0,0 +1,349 @@
+/*
+ * Copyright © 2018 Igalia S.L.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a
+ * copy of this software and associated documentation files (the "Software"),
+ * to deal in the Software without restriction, including without limitation
+ * the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ * and/or sell copies of the Software, and to permit persons to whom the
+ * Software is furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice (including the next
+ * paragraph) shall be included in all copies or substantial portions of the
+ * Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
+ * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
+ * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
+ * IN THE SOFTWARE.
+ */
+
+#include "ir3_nir.h"
+#include "compiler/nir/nir_builder.h"
+
+/**
+ * The goal of this pass is to move to NIR some offset calculations for
+ * various I/O operations that are currently implemented in the backend
+ * compiler, so that NIR gets a chance to optimize them.
+ *
+ * Currently, only offset calculations for image store and image
+ * atomic operations are implemented.
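+ *
+ * As a rough sketch (illustrative, not actual NIR syntax): for a 2D
+ * image store, the coordinate source
+ *
+ *    vec4(x, y, _, _)
+ *
+ * is rewritten so that its unused 4th component carries the computed
+ * byte offset:
+ *
+ *    vec4(x, y, _, x * bpp + y * y_stride)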
+ */
+
+
+/* This flag enables/disables a code path where the bytes-per-pixel of
+ * an image is obtained directly from its format, which is known at
+ * shader compile time; as opposed to using the image_dims[0] constant,
+ * which is only available at shader runtime.
+ *
+ * Inlining the bytes-per-pixel here as an immediate has the advantage
+ * that it gets converted to a single shift (SHL) instruction, because
+ * all possible values are powers of two; whereas loading it as a
+ * uniform and emitting an IMUL causes the backend to expand it into
+ * quite a few instructions (see how imul is handled in
+ * ir3_compiler_nir), ultimately hurting instruction count.
+ */
+#define INLINE_BPP 1
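+
+/* Rough illustration of the two paths, assuming e.g. GL_RGBA32F
+ * (bpp = 16; the numbers are only an example):
+ *
+ *   INLINE_BPP=1:  offset = x * 16              -> folded into: x << 4
+ *   INLINE_BPP=0:  offset = x * image_dims[0]   -> imul of a uniform,
+ *                  expanded by the backend into several instructions
+ */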
+
+
+static bool
+intrinsic_is_image_atomic(unsigned intrinsic)
+{
+ switch (intrinsic) {
+ case nir_intrinsic_image_deref_atomic_add:
+ case nir_intrinsic_image_deref_atomic_min:
+ case nir_intrinsic_image_deref_atomic_max:
+ case nir_intrinsic_image_deref_atomic_and:
+ case nir_intrinsic_image_deref_atomic_or:
+ case nir_intrinsic_image_deref_atomic_xor:
+ case nir_intrinsic_image_deref_atomic_exchange:
+ case nir_intrinsic_image_deref_atomic_comp_swap:
+ return true;
+ default:
+ break;
+ }
+
+ return false;
+}
+
+static bool
+intrinsic_is_image_store_or_atomic(unsigned intrinsic)
+{
+ if (intrinsic == nir_intrinsic_image_deref_store)
+ return true;
+ else
+ return intrinsic_is_image_atomic(intrinsic);
+}
+
+/*
+ * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
+ * out at some point.
+ */
+static unsigned
+get_image_coords(const nir_variable *var)
+{
+ const struct glsl_type *type = glsl_without_array(var->type);
+ unsigned coords;
+
+ switch (glsl_get_sampler_dim(type)) {
+ case GLSL_SAMPLER_DIM_1D:
+ case GLSL_SAMPLER_DIM_BUF:
+ coords = 1;
+ break;
+ case GLSL_SAMPLER_DIM_2D:
+ case GLSL_SAMPLER_DIM_RECT:
+ case GLSL_SAMPLER_DIM_EXTERNAL:
+ case GLSL_SAMPLER_DIM_MS:
+ coords = 2;
+ break;
+ case GLSL_SAMPLER_DIM_3D:
+ case GLSL_SAMPLER_DIM_CUBE:
+ coords = 3;
+ break;
+ default:
+ unreachable("bad sampler dim");
+ return 0;
+ }
+
+ if (glsl_sampler_type_is_array(type)) {
+ /* note: unlike tex_info(), adjust # of coords to include array idx: */
+ coords++;
+ }
+
+ return coords;
+}
+
+#if INLINE_BPP
+/* Returns the bytes-per-pixel for each of the GL formats corresponding to
+ * the supported image formats.
+ */
+static unsigned
+bytes_per_pixel_for_gl_format(GLuint format)
+{
+ switch (format) {
+ case GL_R8I:
+ case GL_R8UI:
+ case GL_R8:
+ case GL_R8_SNORM:
+ return 1;
+
+ case GL_R16F:
+ case GL_R16I:
+ case GL_R16UI:
+ case GL_R16:
+ case GL_R16_SNORM:
+ case GL_RG8I:
+ case GL_RG8UI:
+ case GL_RG8:
+ case GL_RG8_SNORM:
+ return 2;
+
+ case GL_R32F:
+ case GL_R32I:
+ case GL_R32UI:
+ case GL_RG16F:
+ case GL_RG16I:
+ case GL_RG16UI:
+ case GL_RG16:
+ case GL_RG16_SNORM:
+ case GL_RGBA8I:
+ case GL_RGBA8UI:
+ case GL_RGBA8:
+ case GL_RGBA8_SNORM:
+ case GL_RGB10_A2UI:
+ case GL_RGB10_A2:
+ case GL_R11F_G11F_B10F:
+ return 4;
+
+ case GL_RG32F:
+ case GL_RG32I:
+ case GL_RG32UI:
+ case GL_RGBA16F:
+ case GL_RGBA16I:
+ case GL_RGBA16UI:
+ case GL_RGBA16:
+ case GL_RGBA16_SNORM:
+ return 8;
+
+ case GL_RGBA32F:
+ case GL_RGBA32I:
+ case GL_RGBA32UI:
+ return 16;
+
+ default:
+ debug_assert(!"Unhandled GL format");
+ }
+
+ return 0;
+}
+#endif /* INLINE_BPP */
+
+static nir_ssa_def *
+insert_load_image_stride(nir_builder *b, unsigned image_index,
+ unsigned dimension)
+{
+ nir_ssa_def *zero = nir_imm_int(b, 0);
+ nir_intrinsic_instr *load =
+ nir_intrinsic_instr_create(b->shader, nir_intrinsic_load_image_stride);
+ load->num_components = 1;
+ load->const_index[0] = image_index;
+ load->const_index[1] = dimension;
+ load->src[0] = nir_src_for_ssa(zero);
+ nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
+
+ nir_builder_instr_insert(b, &load->instr);
+
+ return &load->dest.ssa;
+}
+
+static void
+lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
+ const nir_variable *var, nir_builder *b,
+ void *mem_ctx)
+{
+ /* Find the instruction that defines the coord source of the image
+ * store/atomic intrinsic. It must be a "vec4" ALU instruction.
+ */
+ debug_assert(intrinsic->src[1].is_ssa);
+ nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;
+
+ nir_instr *offset_parent_instr = offset_src_def->parent_instr;
+ debug_assert(offset_parent_instr->type == nir_instr_type_alu);
+
+ nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
+ debug_assert(vec4_instr->op == nir_op_vec4);
+
+ unsigned coords = get_image_coords(var);
+
+ b->cursor = nir_before_instr(&vec4_instr->instr);
+
+ /* These are actually offsets into the image_dims register file (for
+ * a given image).
+ */
+ enum {
+ BYTES_PER_PIXEL = 0,
+ Y_STRIDE = 1,
+ Z_STRIDE = 2
+ };
+
+ /* x_offset = coords.x * bytes_per_pixel */
+ nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
+#if INLINE_BPP
+ unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
+ nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
+#else
+ nir_ssa_def *bpp =
+ insert_load_image_stride(b, var->data.driver_location, BYTES_PER_PIXEL);
+ nir_ssa_def *offset = nir_imul(b, x_coord, bpp);
+#endif
+ nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
+ debug_assert(imul);
+ imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
+ debug_assert(offset);
+
+ /* For the Y and Z dimensions, we emit a temporary load_image_stride
+ * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(), which
+ * will just emit a uniform with the right value from image_dims[].
+ */
+
+ if (coords > 1) {
+ nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
+ nir_ssa_def *y_stride =
+ insert_load_image_stride(b, var->data.driver_location, Y_STRIDE);
+
+ /* y_offset = coords.y * y_stride + x_offset */
+ offset = nir_imad(b, y_stride, y_coord, offset);
+ debug_assert(offset);
+ nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+ debug_assert(imad);
+ imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];
+ }
+
+ if (coords > 2) {
+ nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
+ nir_ssa_def *z_stride =
+ insert_load_image_stride(b, var->data.driver_location, Z_STRIDE);
+
+ /* z_offset = coords.z * z_stride + y_offset */
+ offset = nir_imad(b, z_stride, z_coord, offset);
+ debug_assert(offset);
+ nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
+ debug_assert(imad);
+ imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
+ }
+
+ if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
+ /* Some cases, like atomics, seem to use a dword offset instead
+ * of a byte offset. The blob driver just puts an extra shr.b in
+ * there in those cases:
+ */
+ nir_ssa_def *two = nir_imm_int(b, 2);
+ offset = nir_ushr(b, offset, two);
+ }
+
+ /* Finally, store the calculated offset in the 4th component of the
+ * vec4 instruction. We use the 4th component because it is the one
+ * we know for sure is not used.
+ */
+ nir_instr_rewrite_src(&vec4_instr->instr,
+ &vec4_instr->src[3].src,
+ nir_src_for_ssa(offset));
+}
+
+static bool
+lower_sampler_io_block(nir_block *block, nir_builder *b, void *mem_ctx)
+{
+ bool progress = false;
+
+ nir_foreach_instr_safe(instr, block) {
+ if (instr->type != nir_instr_type_intrinsic)
+ continue;
+
+ nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
+ if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
+ continue;
+
+ const nir_variable *var = nir_intrinsic_get_var(intr, 0);
+ lower_offset_for_image_store_or_atomic(intr, var, b, mem_ctx);
+
+ progress = true;
+ }
+
+ return progress;
+}
+
+static bool
+lower_sampler_io_func(nir_function_impl *impl)
+{
+ void *mem_ctx = ralloc_parent(impl);
+ nir_builder b;
+ nir_builder_init(&b, impl);
+
+ bool progress = false;
+ nir_foreach_block_safe(block, impl) {
+ progress |= lower_sampler_io_block(block, &b, mem_ctx);
+ }
+
+ if (progress) {
+ nir_metadata_preserve(impl, nir_metadata_block_index |
+ nir_metadata_dominance);
+ }
+
+ return progress;
+}
+
+bool
+ir3_nir_lower_sampler_io(nir_shader *shader)
+{
+ bool progress = false;
+
+ nir_foreach_function(function, shader) {
+ if (function->impl)
+ progress |= lower_sampler_io_func(function->impl);
+ }
+
+ return progress;
+}
--
2.20.1