[Mesa-dev] [PATCH 5/9] ir3/nir: Add a new pass 'ir3_nir_lower_io_offsets'

Eduardo Lima Mitev elima at igalia.com
Tue Feb 26 13:35:04 UTC 2019


On 2/25/19 6:54 PM, Rob Clark wrote:
> On Wed, Feb 13, 2019 at 4:30 PM Eduardo Lima Mitev <elima at igalia.com> wrote:
>>
>> This pass moves to NIR some offset computations that are currently
>> implemented in the IR3 backend compiler, to allow NIR to possibly
>> optimize them.
>>
>> For now, it only supports lowering byte-offset computation for image
>> store and atomics.
>> ---
>>  src/freedreno/Makefile.sources               |   1 +
>>  src/freedreno/ir3/ir3_nir.h                  |   1 +
>>  src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 334 +++++++++++++++++++
>>  3 files changed, 336 insertions(+)
>>  create mode 100644 src/freedreno/ir3/ir3_nir_lower_io_offsets.c
>>
>> diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
>> index 7fea9de39ef..235fec1c4f2 100644
>> --- a/src/freedreno/Makefile.sources
>> +++ b/src/freedreno/Makefile.sources
>> @@ -31,6 +31,7 @@ ir3_SOURCES := \
>>         ir3/ir3_legalize.c \
>>         ir3/ir3_nir.c \
>>         ir3/ir3_nir.h \
>> +       ir3/ir3_nir_lower_io_offsets.c \
>>         ir3/ir3_nir_lower_tg4_to_tex.c \
>>         ir3/ir3_print.c \
>>         ir3/ir3_ra.c \
>> diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
>> index 74201d34160..7983b74af2c 100644
>> --- a/src/freedreno/ir3/ir3_nir.h
>> +++ b/src/freedreno/ir3/ir3_nir.h
>> @@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
>>
>>  bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
>>  bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
>> +bool ir3_nir_lower_io_offsets(nir_shader *shader);
>>
>>  const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
>>  bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
>> diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
>> new file mode 100644
>> index 00000000000..a43b3895fd8
>> --- /dev/null
>> +++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
>> @@ -0,0 +1,334 @@
>> +/*
>> + * Copyright © 2018-2019 Igalia S.L.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + */
>> +
>> +#include "ir3_nir.h"
>> +#include "compiler/nir/nir_builder.h"
>> +
>> +/**
>> + * This pass moves to NIR certain offset computations for different I/O
>> + * ops that are currently implemented in the IR3 backend compiler, to
>> + * give NIR a chance to optimize them:
>> + *
>> + * - Byte-offset for image store and atomics: Emit instructions to
>> + *   compute (x * bpp) + (y * y_stride) + (z * z_stride), and place
>> + *   the resulting SSA value in the 4th component of the vec4
>> + *   instruction that defines the offset.
>> + */
>> +
>> +
>> +static bool
>> +intrinsic_is_image_atomic(unsigned intrinsic)
>> +{
>> +       switch (intrinsic) {
>> +       case nir_intrinsic_image_deref_atomic_add:
>> +       case nir_intrinsic_image_deref_atomic_min:
>> +       case nir_intrinsic_image_deref_atomic_max:
>> +       case nir_intrinsic_image_deref_atomic_and:
>> +       case nir_intrinsic_image_deref_atomic_or:
>> +       case nir_intrinsic_image_deref_atomic_xor:
>> +       case nir_intrinsic_image_deref_atomic_exchange:
>> +       case nir_intrinsic_image_deref_atomic_comp_swap:
>> +               return true;
>> +       default:
>> +               break;
>> +       }
>> +
>> +       return false;
>> +}
>> +
>> +static bool
>> +intrinsic_is_image_store_or_atomic(unsigned intrinsic)
>> +{
>> +       if (intrinsic == nir_intrinsic_image_deref_store)
>> +               return true;
>> +       else
>> +               return intrinsic_is_image_atomic(intrinsic);
>> +}
>> +
>> +/*
>> + * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
>> + * out at some point.
> 
> Sorry, I'd overlooked this patchset.. blame gitlab MR's
> 

No prob. I should have started using MRs already :). Will do that from
now on.

> Anyways, the good news is these helpers were refactored out into
> ir3_image.c.. the bad news is rebasing this series will be a bit
> conflicty since I broke out the image/ssbo intrinsics into
> ir3_a6xx/ir3_a4xx.  (Since they are different depending on the
> generation.. but offhand I think we can use the same nir->nir lowering
> and just ignore one of the nir_src's on a6xx.)
> 

Ok, rebasing onto current master will need some work indeed :).

For the moment I have just prepared a v2 of the series that includes
only the SSBO lowering (roughly last 3 patches of this series). No MR
yet but you can see/test it here:
https://gitlab.freedesktop.org/elima/mesa/commits/fd/lower-io-offsets-v2

Why only SSBO? Because a) the computation of image byte-offsets is not
helping much for now, and b) I suspect you have hit the SSBO dEQP tests
that fail on register allocation, and the SSBO part of this series helps
with that, so I can capitalize on your attention :P.

I enabled the pass on both ir3_a4xx and ir3_a6xx, but have only tested
it on an a5xx board. However, it should help a6xx too, modulo any bugs I
couldn't catch.

I will try to get the pass for SSBO merged first, then add the image
store/atomic and UBO patches I have as follow-ups.

Any feedback much appreciated. Thanks!

Eduardo

> BR,
> -R
> 
>> + */
>> +static unsigned
>> +get_image_coords(const nir_variable *var)
>> +{
>> +       const struct glsl_type *type = glsl_without_array(var->type);
>> +       unsigned coords;
>> +
>> +       switch (glsl_get_sampler_dim(type)) {
>> +       case GLSL_SAMPLER_DIM_1D:
>> +       case GLSL_SAMPLER_DIM_BUF:
>> +               coords = 1;
>> +               break;
>> +       case GLSL_SAMPLER_DIM_2D:
>> +       case GLSL_SAMPLER_DIM_RECT:
>> +       case GLSL_SAMPLER_DIM_EXTERNAL:
>> +       case GLSL_SAMPLER_DIM_MS:
>> +               coords = 2;
>> +               break;
>> +       case GLSL_SAMPLER_DIM_3D:
>> +       case GLSL_SAMPLER_DIM_CUBE:
>> +               coords = 3;
>> +               break;
>> +       default:
>> +               unreachable("bad sampler dim");
>> +               return 0;
>> +       }
>> +
>> +       if (glsl_sampler_type_is_array(type)) {
>> +               /* adjust # of coords to include array idx: */
>> +               coords++;
>> +       }
>> +
>> +       return coords;
>> +}
>> +
>> +/* Returns the bytes per pixel for each of the GL formats corresponding
>> + * to the supported image formats.
>> + */
>> +static unsigned
>> +bytes_per_pixel_for_gl_format(GLuint format)
>> +{
>> +       switch (format) {
>> +       case GL_R8I:
>> +       case GL_R8UI:
>> +       case GL_R8:
>> +       case GL_R8_SNORM:
>> +               return 1;
>> +
>> +       case GL_R16F:
>> +       case GL_R16I:
>> +       case GL_R16UI:
>> +       case GL_R16:
>> +       case GL_R16_SNORM:
>> +       case GL_RG8I:
>> +       case GL_RG8UI:
>> +       case GL_RG8:
>> +       case GL_RG8_SNORM:
>> +               return 2;
>> +
>> +       case GL_R32F:
>> +       case GL_R32I:
>> +       case GL_R32UI:
>> +       case GL_RG16F:
>> +       case GL_RG16I:
>> +       case GL_RG16UI:
>> +       case GL_RG16:
>> +       case GL_RG16_SNORM:
>> +       case GL_RGBA8I:
>> +       case GL_RGBA8UI:
>> +       case GL_RGBA8:
>> +       case GL_RGBA8_SNORM:
>> +       case GL_RGB10_A2UI:
>> +       case GL_RGB10_A2:
>> +       case GL_R11F_G11F_B10F:
>> +               return 4;
>> +
>> +       case GL_RG32F:
>> +       case GL_RG32I:
>> +       case GL_RG32UI:
>> +       case GL_RGBA16F:
>> +       case GL_RGBA16I:
>> +       case GL_RGBA16UI:
>> +       case GL_RGBA16:
>> +       case GL_RGBA16_SNORM:
>> +               return 8;
>> +
>> +       case GL_RGBA32F:
>> +       case GL_RGBA32I:
>> +       case GL_RGBA32UI:
>> +               return 16;
>> +
>> +       default:
>> +               debug_assert(!"Unhandled GL format");
>> +       }
>> +
>> +       return 0;
>> +}
>> +
>> +static nir_ssa_def *
>> +insert_load_image_param(nir_builder *b, nir_ssa_def *image_deref,
>> +                                               unsigned dimension)
>> +{
>> +       nir_intrinsic_instr *load =
>> +               nir_intrinsic_instr_create(b->shader,
>> +                                                                  nir_intrinsic_image_deref_load_param_ir3);
>> +       load->num_components = 1;
>> +       load->const_index[0] = dimension;
>> +       load->src[0] = nir_src_for_ssa(image_deref);
>> +       nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
>> +
>> +       nir_builder_instr_insert(b, &load->instr);
>> +
>> +       return &load->dest.ssa;
>> +}
>> +
>> +static bool
>> +lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
>> +                                                                          const nir_variable *var, nir_builder *b,
>> +                                                                          void *mem_ctx)
>> +{
>> +       nir_ssa_def *image_deref;
>> +
>> +       /* Find the instruction that defines the coord source of the image
>> +        * store/atomic intrinsic. It must be a "vec4" ALU instruction.
>> +        */
>> +       debug_assert(intrinsic->src[1].is_ssa);
>> +       nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;
>> +
>> +       nir_instr *offset_parent_instr = offset_src_def->parent_instr;
>> +       debug_assert(offset_parent_instr->type == nir_instr_type_alu);
>> +
>> +       nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
>> +       debug_assert(vec4_instr->op == nir_op_vec4);
>> +
>> +       unsigned coords = get_image_coords(var);
>> +
>> +       b->cursor = nir_before_instr(&vec4_instr->instr);
>> +
>> +       /* These are actually offsets into the image_dims register file
>> +        * (for a given image).
>> +        */
>> +       enum {
>> +               BYTES_PER_PIXEL = 0,
>> +               Y_STRIDE        = 1,
>> +               Z_STRIDE        = 2
>> +       };
>> +
>> +       /* x_offset = coord.x * bytes_per_pixel */
>> +       nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
>> +       unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
>> +       nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
>> +       debug_assert(offset);
>> +       nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
>> +       debug_assert(imul);
>> +       imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
>> +
>> +       /* For Y and Z dimensions, we emit a temporary image_deref_load_param
>> +        * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(), which
>> +        * will just emit a uniform with the right value from image_dims[].
>> +        */
>> +
>> +       if (coords > 1) {
>> +               /* src0 of the intrinsic is the dereferenced image. */
>> +               debug_assert(intrinsic->src[0].is_ssa);
>> +               image_deref = intrinsic->src[0].ssa;
>> +
>> +               nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
>> +               nir_ssa_def *y_stride =
>> +                       insert_load_image_param(b, image_deref, Y_STRIDE);
>> +
>> +               /* y_offset = coord.y * y_stride + x_offset */
>> +               offset = nir_imad24_ir3(b, y_stride, y_coord, offset);
>> +               debug_assert(offset);
>> +               nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
>> +               debug_assert(imad);
>> +               imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];
>> +
>> +               if (coords > 2) {
>> +                       nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
>> +                       nir_ssa_def *z_stride =
>> +                               insert_load_image_param(b, image_deref, Z_STRIDE);
>> +
>> +                       /* z_offset = coord.z * z_stride + y_offset */
>> +                       offset = nir_imad24_ir3(b, z_stride, z_coord, offset);
>> +                       debug_assert(offset);
>> +                       nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
>> +                       debug_assert(imad);
>> +                       imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
>> +               }
>> +       }
>> +
>> +       if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
>> +               /* Some cases, like atomics, seem to use dword offsets instead
>> +                * of byte offsets.. the blob just puts an extra shr.b in there
>> +                * in those cases:
>> +                */
>> +               nir_ssa_def *two = nir_imm_int(b, 2);
>> +               offset = nir_ushr(b, offset, two);
>> +       }
>> +
>> +       /* Finally, store the computed offset in the 4th component of the
>> +        * vec4 instruction, which is the only one guaranteed not to be used.
>> +        * We don't want to override the original coordinate because it is
>> +        * still needed in the backend.
>> +        */
>> +       nir_instr_rewrite_src(&vec4_instr->instr,
>> +                                                 &vec4_instr->src[3].src,
>> +                                                 nir_src_for_ssa(offset));
>> +       return true;
>> +}
>> +
>> +static bool
>> +lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
>> +{
>> +       bool progress = false;
>> +
>> +       nir_foreach_instr_safe(instr, block) {
>> +               if (instr->type != nir_instr_type_intrinsic)
>> +                       continue;
>> +
>> +               nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
>> +               if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
>> +                       continue;
>> +
>> +               const nir_variable *var = nir_intrinsic_get_var(intr, 0);
>> +               progress |= lower_offset_for_image_store_or_atomic(intr, var, b,
>> +                                                                                                                  mem_ctx);
>> +       }
>> +
>> +       return progress;
>> +}
>> +
>> +static bool
>> +lower_io_offsets_func(nir_function_impl *impl)
>> +{
>> +       void *mem_ctx = ralloc_parent(impl);
>> +       nir_builder b;
>> +       nir_builder_init(&b, impl);
>> +
>> +       bool progress = false;
>> +       nir_foreach_block_safe(block, impl) {
>> +               progress |= lower_io_offsets_block(block, &b, mem_ctx);
>> +       }
>> +
>> +       if (progress) {
>> +               nir_metadata_preserve(impl, nir_metadata_block_index |
>> +                                                                       nir_metadata_dominance);
>> +       }
>> +
>> +       return progress;
>> +}
>> +
>> +bool
>> +ir3_nir_lower_io_offsets(nir_shader *shader)
>> +{
>> +       bool progress = false;
>> +
>> +       nir_foreach_function(function, shader) {
>> +               if (function->impl)
>> +                       progress |= lower_io_offsets_func(function->impl);
>> +       }
>> +
>> +       return progress;
>> +}
>> --
>> 2.20.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
> 

