[Mesa-dev] [PATCH 5/9] ir3/nir: Add a new pass 'ir3_nir_lower_io_offsets'
Eduardo Lima Mitev
elima at igalia.com
Tue Feb 26 13:35:04 UTC 2019
On 2/25/19 6:54 PM, Rob Clark wrote:
> On Wed, Feb 13, 2019 at 4:30 PM Eduardo Lima Mitev <elima at igalia.com> wrote:
>>
>> This pass moves to NIR some offset computations that are currently
>> implemented on the IR3 backend compiler, to allow NIR to possibly
>> optimize them.
>>
>> For now, it only supports lowering byte-offset computation for image
>> store and atomics.
>> ---
>> src/freedreno/Makefile.sources | 1 +
>> src/freedreno/ir3/ir3_nir.h | 1 +
>> src/freedreno/ir3/ir3_nir_lower_io_offsets.c | 334 +++++++++++++++++++
>> 3 files changed, 336 insertions(+)
>> create mode 100644 src/freedreno/ir3/ir3_nir_lower_io_offsets.c
>>
>> diff --git a/src/freedreno/Makefile.sources b/src/freedreno/Makefile.sources
>> index 7fea9de39ef..235fec1c4f2 100644
>> --- a/src/freedreno/Makefile.sources
>> +++ b/src/freedreno/Makefile.sources
>> @@ -31,6 +31,7 @@ ir3_SOURCES := \
>> ir3/ir3_legalize.c \
>> ir3/ir3_nir.c \
>> ir3/ir3_nir.h \
>> + ir3/ir3_nir_lower_io_offsets.c \
>> ir3/ir3_nir_lower_tg4_to_tex.c \
>> ir3/ir3_print.c \
>> ir3/ir3_ra.c \
>> diff --git a/src/freedreno/ir3/ir3_nir.h b/src/freedreno/ir3/ir3_nir.h
>> index 74201d34160..7983b74af2c 100644
>> --- a/src/freedreno/ir3/ir3_nir.h
>> +++ b/src/freedreno/ir3/ir3_nir.h
>> @@ -36,6 +36,7 @@ void ir3_nir_scan_driver_consts(nir_shader *shader, struct ir3_driver_const_layo
>>
>> bool ir3_nir_apply_trig_workarounds(nir_shader *shader);
>> bool ir3_nir_lower_tg4_to_tex(nir_shader *shader);
>> +bool ir3_nir_lower_io_offsets(nir_shader *shader);
>>
>> const nir_shader_compiler_options * ir3_get_compiler_options(struct ir3_compiler *compiler);
>> bool ir3_key_lowers_nir(const struct ir3_shader_key *key);
>> diff --git a/src/freedreno/ir3/ir3_nir_lower_io_offsets.c b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
>> new file mode 100644
>> index 00000000000..a43b3895fd8
>> --- /dev/null
>> +++ b/src/freedreno/ir3/ir3_nir_lower_io_offsets.c
>> @@ -0,0 +1,334 @@
>> +/*
>> + * Copyright © 2018-2019 Igalia S.L.
>> + *
>> + * Permission is hereby granted, free of charge, to any person obtaining a
>> + * copy of this software and associated documentation files (the "Software"),
>> + * to deal in the Software without restriction, including without limitation
>> + * the rights to use, copy, modify, merge, publish, distribute, sublicense,
>> + * and/or sell copies of the Software, and to permit persons to whom the
>> + * Software is furnished to do so, subject to the following conditions:
>> + *
>> + * The above copyright notice and this permission notice (including the next
>> + * paragraph) shall be included in all copies or substantial portions of the
>> + * Software.
>> + *
>> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
>> + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
>> + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
>> + * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
>> + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
>> + * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
>> + * IN THE SOFTWARE.
>> + */
>> +
>> +#include "ir3_nir.h"
>> +#include "compiler/nir/nir_builder.h"
>> +
>> +/**
>> + * This pass moves to NIR certain offset computations for different I/O
>> + * ops that are currently implemented on the IR3 backend compiler, to
>> + * give NIR a chance to optimize them:
>> + *
>> + * - Byte-offset for image store and atomics: Emit instructions to
>> + * compute (x * bpp) + (y * y_stride) + (z * z_stride), and place the resulting
>> + * SSA value in the 4th-component of the vec4 instruction that defines
>> + * the offset.
>> + */
>> +
>> +
>> +static bool
>> +intrinsic_is_image_atomic(unsigned intrinsic)
>> +{
>> + switch (intrinsic) {
>> + case nir_intrinsic_image_deref_atomic_add:
>> + case nir_intrinsic_image_deref_atomic_min:
>> + case nir_intrinsic_image_deref_atomic_max:
>> + case nir_intrinsic_image_deref_atomic_and:
>> + case nir_intrinsic_image_deref_atomic_or:
>> + case nir_intrinsic_image_deref_atomic_xor:
>> + case nir_intrinsic_image_deref_atomic_exchange:
>> + case nir_intrinsic_image_deref_atomic_comp_swap:
>> + return true;
>> + default:
>> + break;
>> + }
>> +
>> + return false;
>> +}
>> +
>> +static bool
>> +intrinsic_is_image_store_or_atomic(unsigned intrinsic)
>> +{
>> + if (intrinsic == nir_intrinsic_image_deref_store)
>> + return true;
>> + else
>> + return intrinsic_is_image_atomic(intrinsic);
>> +}
>> +
>> +/*
>> + * FIXME: shamelessly copied from ir3_compiler_nir until it gets factored
>> + * out at some point.
>
> Sorry, I'd overlooked this patchset.. blame gitlab MR's
>
No prob. I should have started using MRs already :). Will do that from
now on.
> Anyways, the good news is these helpers were refactored out into
> ir3_image.c.. the bad news is rebasing this series will be a bit
> conflicty since I broke out the image/ssbo intrinsics into
> ir3_a6xx/ir3_a4xx. (Since they are different depending on the
> generation.. but offhand I think we can use the same nir->nir lowering
> and just ignore one of the nir_src's on a6xx.)
>
Ok, rebasing current master will need some work indeed :).
For the moment I have just prepared a v2 of the series that includes
only the SSBO lowering (roughly last 3 patches of this series). No MR
yet but you can see/test it here:
https://gitlab.freedesktop.org/elima/mesa/commits/fd/lower-io-offsets-v2
Why only SSBO? Because a) the computation of image byte-offset is not
helping much for now, and b) I suspect you have hit the SSBO dEQP tests
that fail on register allocation, and the SSBO part in this series helps
with that, so I can capitalize on your attention :P.
I enabled the pass on both ir3_a4xx and ir3_a6xx, but have only tested
it on a5xx board. However, it should help a6xx too modulo any bug I
couldn't catch.
I will try to get the pass for SSBO merged first, then add the image
store/atomic and UBO patches I have as follow-ups.
Any feedback much appreciated. Thanks!
Eduardo
> BR,
> -R
>
>> + */
>> +static unsigned
>> +get_image_coords(const nir_variable *var)
>> +{
>> + const struct glsl_type *type = glsl_without_array(var->type);
>> + unsigned coords;
>> +
>> + switch (glsl_get_sampler_dim(type)) {
>> + case GLSL_SAMPLER_DIM_1D:
>> + case GLSL_SAMPLER_DIM_BUF:
>> + coords = 1;
>> + break;
>> + case GLSL_SAMPLER_DIM_2D:
>> + case GLSL_SAMPLER_DIM_RECT:
>> + case GLSL_SAMPLER_DIM_EXTERNAL:
>> + case GLSL_SAMPLER_DIM_MS:
>> + coords = 2;
>> + break;
>> + case GLSL_SAMPLER_DIM_3D:
>> + case GLSL_SAMPLER_DIM_CUBE:
>> + coords = 3;
>> + break;
>> + default:
>> + unreachable("bad sampler dim");
>> + return 0;
>> + }
>> +
>> + if (glsl_sampler_type_is_array(type)) {
>> + /* adjust # of coords to include array idx: */
>> + coords++;
>> + }
>> +
>> + return coords;
>> +}
>> +
>> +/* Returns the bytes-per-pixel for the different GL formats corresponding to
>> + * all supported image formats.
>> + */
>> +static unsigned
>> +bytes_per_pixel_for_gl_format(GLuint format)
>> +{
>> + switch (format) {
>> + case GL_R8I:
>> + case GL_R8UI:
>> + case GL_R8:
>> + case GL_R8_SNORM:
>> + return 1;
>> +
>> + case GL_R16F:
>> + case GL_R16I:
>> + case GL_R16UI:
>> + case GL_R16:
>> + case GL_R16_SNORM:
>> + case GL_RG8I:
>> + case GL_RG8UI:
>> + case GL_RG8:
>> + case GL_RG8_SNORM:
>> + return 2;
>> +
>> + case GL_R32F:
>> + case GL_R32I:
>> + case GL_R32UI:
>> + case GL_RG16F:
>> + case GL_RG16I:
>> + case GL_RG16UI:
>> + case GL_RG16:
>> + case GL_RG16_SNORM:
>> + case GL_RGBA8I:
>> + case GL_RGBA8UI:
>> + case GL_RGBA8:
>> + case GL_RGBA8_SNORM:
>> + case GL_RGB10_A2UI:
>> + case GL_RGB10_A2:
>> + case GL_R11F_G11F_B10F:
>> + return 4;
>> +
>> + case GL_RG32F:
>> + case GL_RG32I:
>> + case GL_RG32UI:
>> + case GL_RGBA16F:
>> + case GL_RGBA16I:
>> + case GL_RGBA16UI:
>> + case GL_RGBA16:
>> + case GL_RGBA16_SNORM:
>> + return 8;
>> +
>> + case GL_RGBA32F:
>> + case GL_RGBA32I:
>> + case GL_RGBA32UI:
>> + return 16;
>> +
>> + default:
>> + debug_assert(!"Unhandled GL format");
>> + }
>> +
>> + return 0;
>> +}
>> +
>> +static nir_ssa_def *
>> +insert_load_image_param(nir_builder *b, nir_ssa_def *image_deref,
>> + unsigned dimension)
>> +{
>> + nir_intrinsic_instr *load =
>> + nir_intrinsic_instr_create(b->shader,
>> + nir_intrinsic_image_deref_load_param_ir3);
>> + load->num_components = 1;
>> + load->const_index[0] = dimension;
>> + load->src[0] = nir_src_for_ssa(image_deref);
>> + nir_ssa_dest_init(&load->instr, &load->dest, 1, 32, NULL);
>> +
>> + nir_builder_instr_insert(b, &load->instr);
>> +
>> + return &load->dest.ssa;
>> +}
>> +
/* Computes, in NIR, the byte offset of the texel addressed by an image
 * store/atomic intrinsic:
 *
 *    offset = coord.x * bpp + coord.y * y_stride + coord.z * z_stride
 *
 * (for atomics the result is additionally shifted right by 2, turning it
 * into a dword offset -- see note below), and stores the resulting SSA
 * value in the unused 4th component of the vec4 that defines the
 * intrinsic's coord source, where the backend can pick it up. The original
 * x/y/z components are left untouched because the backend still needs the
 * raw coordinates. Always returns true (progress).
 *
 * NOTE(review): assumes the coord source is always defined by a
 * nir_op_vec4 ALU instruction; this is only checked with debug_assert, so
 * worth confirming it holds for every intrinsic this pass handles.
 * 'mem_ctx' is currently unused.
 */
static bool
lower_offset_for_image_store_or_atomic(nir_intrinsic_instr *intrinsic,
		const nir_variable *var, nir_builder *b,
		void *mem_ctx)
{
	nir_ssa_def *image_deref;

	/* Find the instruction that defines the coord source of the image
	 * store/atomic intrinsic. It must be a "vec4" ALU instruction.
	 */
	debug_assert(intrinsic->src[1].is_ssa);
	nir_ssa_def *offset_src_def = intrinsic->src[1].ssa;

	nir_instr *offset_parent_instr = offset_src_def->parent_instr;
	debug_assert(offset_parent_instr->type == nir_instr_type_alu);

	nir_alu_instr *vec4_instr = nir_instr_as_alu(offset_parent_instr);
	debug_assert(vec4_instr->op == nir_op_vec4);

	unsigned coords = get_image_coords(var);

	/* Insert the new offset computation right before the vec4 we are
	 * going to patch, so its result is available to that vec4.
	 */
	b->cursor = nir_before_instr(&vec4_instr->instr);

	/* These are actually offsets into image_dims register file (for
	 * a given image).
	 */
	enum {
		BYTES_PER_PIXEL = 0,
		Y_STRIDE = 1,
		Z_STRIDE = 2
	};

	/* x_offset = coord.x * bytes_per_pixel */
	nir_ssa_def *x_coord = vec4_instr->src[0].src.ssa;
	unsigned bpp = bytes_per_pixel_for_gl_format(var->data.image.format);
	nir_ssa_def *offset = nir_imul_imm(b, x_coord, bpp);
	nir_alu_instr *imul = nir_instr_as_alu(offset->parent_instr);
	debug_assert(imul);
	/* The multiply was fed the whole SSA def the vec4 reads from, so
	 * propagate the component the vec4 actually selected into the new
	 * instruction's source swizzle.
	 */
	imul->src[0].swizzle[0] = vec4_instr->src[0].swizzle[0];
	debug_assert(offset);

	/* For Y and Z dimensions, we emit a temporary image_deref_load_param
	 * intrinsic, to be consumed by ir3_compiler_nir::emit_intrinsic(), which
	 * will just emit a uniform with the right value from image_dims[].
	 */

	if (coords > 1) {
		/* src0 of the intrinsic is the dereferenced image. */
		debug_assert(intrinsic->src[0].is_ssa);
		image_deref = intrinsic->src[0].ssa;

		nir_ssa_def *y_coord = vec4_instr->src[1].src.ssa;
		nir_ssa_def *y_stride =
			insert_load_image_param(b, image_deref, Y_STRIDE);

		/* y_offset = coord.y * y_stride + x_offset */
		offset = nir_imad24_ir3(b, y_stride, y_coord, offset);
		debug_assert(offset);
		nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
		debug_assert(imad);
		/* Same swizzle fix-up as for the x component above. */
		imad->src[1].swizzle[0] = vec4_instr->src[1].swizzle[0];

		if (coords > 2) {
			nir_ssa_def *z_coord = vec4_instr->src[2].src.ssa;
			nir_ssa_def *z_stride =
				insert_load_image_param(b, image_deref, Z_STRIDE);

			/* z_offset = coord.z * z_stride + y_offset */
			offset = nir_imad24_ir3(b, z_stride, z_coord, offset);
			debug_assert(offset);
			nir_alu_instr *imad = nir_instr_as_alu(offset->parent_instr);
			debug_assert(imad);
			imad->src[1].swizzle[0] = vec4_instr->src[2].swizzle[0];
		}
	}

	if (intrinsic_is_image_atomic(intrinsic->intrinsic)) {
		/* Some cases, like atomics, seem to use dword offset instead
		 * of byte offsets.. blob just puts an extra shr.b in there
		 * in those cases:
		 */
		nir_ssa_def *two = nir_imm_int(b, 2);
		offset = nir_ushr(b, offset, two);
	}

	/* Finally, store the computed offset in the 4th component of the
	 * vec4 instruction, which is the only one guaranteed not to be used.
	 * We don't want to override the original coordinate because it is
	 * still needed in the backend.
	 */
	nir_instr_rewrite_src(&vec4_instr->instr,
			      &vec4_instr->src[3].src,
			      nir_src_for_ssa(offset));
	return true;
}
>> +
>> +static bool
>> +lower_io_offsets_block(nir_block *block, nir_builder *b, void *mem_ctx)
>> +{
>> + bool progress = false;
>> +
>> + nir_foreach_instr_safe(instr, block) {
>> + if (instr->type != nir_instr_type_intrinsic)
>> + continue;
>> +
>> + nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr);
>> + if (!intrinsic_is_image_store_or_atomic(intr->intrinsic))
>> + continue;
>> +
>> + const nir_variable *var = nir_intrinsic_get_var(intr, 0);
>> + progress |= lower_offset_for_image_store_or_atomic(intr, var, b,
>> + mem_ctx);
>> + }
>> +
>> + return progress;
>> +}
>> +
>> +static bool
>> +lower_io_offsets_func(nir_function_impl *impl)
>> +{
>> + void *mem_ctx = ralloc_parent(impl);
>> + nir_builder b;
>> + nir_builder_init(&b, impl);
>> +
>> + bool progress = false;
>> + nir_foreach_block_safe(block, impl) {
>> + progress |= lower_io_offsets_block(block, &b, mem_ctx);
>> + }
>> +
>> + if (progress) {
>> + nir_metadata_preserve(impl, nir_metadata_block_index |
>> + nir_metadata_dominance);
>> + }
>> +
>> + return progress;
>> +}
>> +
>> +bool
>> +ir3_nir_lower_io_offsets(nir_shader *shader)
>> +{
>> + bool progress = false;
>> +
>> + nir_foreach_function(function, shader) {
>> + if (function->impl)
>> + progress |= lower_io_offsets_func(function->impl);
>> + }
>> +
>> + return progress;
>> +}
>> --
>> 2.20.1
>>
>> _______________________________________________
>> mesa-dev mailing list
>> mesa-dev at lists.freedesktop.org
>> https://lists.freedesktop.org/mailman/listinfo/mesa-dev
>
More information about the mesa-dev
mailing list