[Freedreno] [RFC 4/4] ir3: Use ir3_nir_lower_sampler_io pass
Eduardo Lima Mitev
elima at igalia.com
Fri Jan 25 15:48:33 UTC 2019
This effectively removes all offset calculations from
ir3_compiler_nir::get_image_offset(); the ir3_nir_lower_sampler_io pass
now emits them in NIR instead.
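For reference, the offset that get_image_offset() used to compute in the
backend (and that the lowering pass now materializes, handing the result
to the backend through the 4th coordinate component) is equivalent to
the following. This is a minimal C sketch with illustrative names, not
code from the patch:

    #include <stdint.h>

    /* Byte offset of an image texel, as the removed backend code computed
     * it from the per-image constants (bytes per pixel, y and z strides).
     */
    static uint32_t
    image_byte_offset(uint32_t x, uint32_t y, uint32_t z,
                      uint32_t bpp, uint32_t y_stride, uint32_t z_stride,
                      unsigned ncoords)
    {
       uint32_t off = x * bpp;      /* offset  = coords.x * bytes_per_pixel */
       if (ncoords > 1)
          off += y * y_stride;      /* offset += coords.y * y_pitch */
       if (ncoords > 2)
          off += z * z_stride;      /* offset += coords.z * z_pitch */
       return off;
    }

    /* Image atomics take a dword offset instead of a byte offset, hence
     * the extra shift-right by 2 the backend used to emit (the blob adds
     * an shr.b for the same reason).
     */
    static uint32_t
    image_dword_offset(uint32_t byte_off)
    {
       return byte_off >> 2;
    }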
No regressions observed on affected tests from the Khronos CTS and
piglit suites, compared to master.
Collecting useful stats on the helps/hurts caused by this pass is still
WIP. Very few shaders in the shader-db database exercise image store or
image atomic ops, and of those that do, most require a higher GLSL
version than freedreno supports, so they get skipped.
There is ongoing work on writing/porting shaders to collect useful
stats. So far, all shaders tested show no meaningful difference compared
to master.
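For clarity, the per-image stride constants read by the
nir_intrinsic_load_image_stride intrinsic (presumably introduced earlier
in this series) are indexed 0..2 within the image_dims const block, as
handled in the diff below. A hypothetical enum for illustration only;
the patch itself uses bare 0/1/2:

    /* Second const_index of nir_intrinsic_load_image_stride: */
    enum image_stride_index {
       IMAGE_STRIDE_BPP     = 0,  /* bytes per pixel            */
       IMAGE_STRIDE_Y_PITCH = 1,  /* bytes per row   (y stride) */
       IMAGE_STRIDE_Z_PITCH = 2,  /* bytes per slice (z stride) */
    };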
---
src/freedreno/ir3/ir3_compiler_nir.c | 61 +++++++++++++---------------
src/freedreno/ir3/ir3_nir.c | 1 +
2 files changed, 29 insertions(+), 33 deletions(-)
diff --git a/src/freedreno/ir3/ir3_compiler_nir.c b/src/freedreno/ir3/ir3_compiler_nir.c
index fd641735620..fe329db658c 100644
--- a/src/freedreno/ir3/ir3_compiler_nir.c
+++ b/src/freedreno/ir3/ir3_compiler_nir.c
@@ -548,6 +548,9 @@ emit_alu(struct ir3_context *ctx, nir_alu_instr *alu)
ir3_MADSH_M16(b, src[0], 0, src[1], 0,
ir3_MULL_U(b, src[0], 0, src[1], 0), 0), 0);
break;
+ case nir_op_imad:
+ dst[0] = ir3_MAD_S24(b, src[0], 0, src[1], 0, src[2], 0);
+ break;
case nir_op_ineg:
dst[0] = ir3_ABSNEG_S(b, src[0], IR3_REG_SNEG);
break;
@@ -1172,44 +1175,19 @@ get_image_type(const nir_variable *var)
static struct ir3_instruction *
get_image_offset(struct ir3_context *ctx, const nir_variable *var,
- struct ir3_instruction * const *coords, bool byteoff)
+ struct ir3_instruction * const *coords)
{
struct ir3_block *b = ctx->block;
- struct ir3_instruction *offset;
- unsigned ncoords = get_image_coords(var, NULL);
-
- /* to calculate the byte offset (yes, uggg) we need (up to) three
- * const values to know the bytes per pixel, and y and z stride:
- */
- unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
- ctx->so->const_layout.image_dims.off[var->data.driver_location];
debug_assert(ctx->so->const_layout.image_dims.mask &
(1 << var->data.driver_location));
- /* offset = coords.x * bytes_per_pixel: */
- offset = ir3_MUL_S(b, coords[0], 0, create_uniform(b, cb + 0), 0);
- if (ncoords > 1) {
- /* offset += coords.y * y_pitch: */
- offset = ir3_MAD_S24(b, create_uniform(b, cb + 1), 0,
- coords[1], 0, offset, 0);
- }
- if (ncoords > 2) {
- /* offset += coords.z * z_pitch: */
- offset = ir3_MAD_S24(b, create_uniform(b, cb + 2), 0,
- coords[2], 0, offset, 0);
- }
-
- if (!byteoff) {
- /* Some cases, like atomics, seem to use dword offset instead
- * of byte offsets.. blob just puts an extra shr.b in there
- * in those cases:
- */
- offset = ir3_SHR_B(b, offset, 0, create_immed(b, 2), 0);
- }
-
+ /* ir3_nir_lower_sampler_io pass should have placed the final
+ * byte-offset (or dword offset for atomics) at the 4th component
+ * of the coordinate vector.
+ */
return ir3_create_collect(ctx, (struct ir3_instruction*[]){
- offset,
+ coords[3],
create_immed(b, 0),
}, 2);
}
@@ -1341,7 +1319,7 @@ emit_intrinsic_store_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
* src2 is 64b byte offset
*/
- offset = get_image_offset(ctx, var, coords, true);
+ offset = get_image_offset(ctx, var, coords);
/* NOTE: stib seems to take byte offset, but stgb.typed can be used
* too and takes a dword offset.. not quite sure yet why blob uses
@@ -1443,7 +1421,7 @@ emit_intrinsic_atomic_image(struct ir3_context *ctx, nir_intrinsic_instr *intr)
*/
src0 = ir3_get_src(ctx, &intr->src[3])[0];
src1 = ir3_create_collect(ctx, coords, ncoords);
- src2 = get_image_offset(ctx, var, coords, false);
+ src2 = get_image_offset(ctx, var, coords);
switch (intr->intrinsic) {
case nir_intrinsic_image_deref_atomic_add:
@@ -1612,6 +1590,23 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
}
switch (intr->intrinsic) {
+ case nir_intrinsic_load_image_stride: {
+ idx = intr->const_index[0];
+
+ /* this is the index into image_dims offsets, which can take
+ * values 0, 1 or 2 (bpp, y-stride, z-stride respectively).
+ */
+ uint8_t off = intr->const_index[1];
+ debug_assert(off <= 2);
+
+ unsigned cb = regid(ctx->so->constbase.image_dims, 0) +
+ ctx->so->const_layout.image_dims.off[idx];
+ debug_assert(ctx->so->const_layout.image_dims.mask & (1 << idx));
+
+ dst[0] = create_uniform(b, cb + off);
+ break;
+ }
+
case nir_intrinsic_load_uniform:
idx = nir_intrinsic_base(intr);
const_offset = nir_src_as_const_value(intr->src[0]);
diff --git a/src/freedreno/ir3/ir3_nir.c b/src/freedreno/ir3/ir3_nir.c
index d9fcf798b3d..68a0edb343c 100644
--- a/src/freedreno/ir3/ir3_nir.c
+++ b/src/freedreno/ir3/ir3_nir.c
@@ -160,6 +160,7 @@ ir3_optimize_nir(struct ir3_shader *shader, nir_shader *s,
OPT_V(s, nir_opt_global_to_local);
OPT_V(s, nir_lower_regs_to_ssa);
+ OPT_V(s, ir3_nir_lower_sampler_io);
if (key) {
if (s->info.stage == MESA_SHADER_VERTEX) {
--
2.20.1