[Mesa-dev] [PATCH 09/12] nir: Add lowering support for packing opcodes.

Iago Toral itoral at igalia.com
Thu Jan 28 00:32:15 PST 2016


On Mon, 2016-01-25 at 15:18 -0800, Matt Turner wrote:
> ---
>  src/glsl/nir/nir.h                     |  4 ++++
>  src/glsl/nir/nir_lower_alu_to_scalar.c | 32 ++++++++++++++++++++++++++++++++
>  src/glsl/nir/nir_opcodes.py            | 10 ++++++++++
>  src/glsl/nir/nir_opt_algebraic.py      | 20 ++++++++++++++++++++
>  4 files changed, 66 insertions(+)
> 
> diff --git a/src/glsl/nir/nir.h b/src/glsl/nir/nir.h
> index 7b39cbb..bbd5b1a 100644
> --- a/src/glsl/nir/nir.h
> +++ b/src/glsl/nir/nir.h
> @@ -1469,6 +1469,10 @@ typedef struct nir_shader_compiler_options {
>     bool lower_ffract;
>  
>     bool lower_pack_half_2x16;
> +   bool lower_pack_unorm_2x16;
> +   bool lower_pack_snorm_2x16;
> +   bool lower_pack_unorm_4x8;
> +   bool lower_pack_snorm_4x8;
>     bool lower_unpack_half_2x16;
>  
>     bool lower_extract_byte;
> diff --git a/src/glsl/nir/nir_lower_alu_to_scalar.c b/src/glsl/nir/nir_lower_alu_to_scalar.c
> index 5372fbe..37cb022 100644
> --- a/src/glsl/nir/nir_lower_alu_to_scalar.c
> +++ b/src/glsl/nir/nir_lower_alu_to_scalar.c
> @@ -134,6 +134,38 @@ lower_alu_instr_scalar(nir_alu_instr *instr, nir_builder *b)
>        return;
>     }
>  
> +   case nir_op_pack_uvec2_to_uint: {
> +      assert(b->shader->options->lower_pack_snorm_2x16 ||
> +             b->shader->options->lower_pack_unorm_2x16);

So we only want opt_algebraic to generate these opcodes? Why? I see that
this is the case now, and there is probably no reason for other
IR -> NIR translators to emit these opcodes directly instead of going
through opt_algebraic, but do we need to prevent that with an assert?

I am not against it, just curious.

> +      nir_ssa_def *word =
> +         nir_extract_uword(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
> +      nir_ssa_def *val =
> +         nir_ior(b, nir_ishl(b, nir_channel(b, word, 1), nir_imm_int(b, 16)),
> +                                nir_channel(b, word, 0));
> +
> +      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
> +      nir_instr_remove(&instr->instr);
> +      break;
> +   }
> +
> +   case nir_op_pack_uvec4_to_uint: {
> +      assert(b->shader->options->lower_pack_snorm_4x8 ||
> +             b->shader->options->lower_pack_unorm_4x8);
> +
> +      nir_ssa_def *byte =
> +         nir_extract_ubyte(b, instr->src[0].src.ssa, nir_imm_int(b, 0));
> +      nir_ssa_def *val =
> +         nir_ior(b, nir_ior(b, nir_ishl(b, nir_channel(b, byte, 3), nir_imm_int(b, 24)),
> +                               nir_ishl(b, nir_channel(b, byte, 2), nir_imm_int(b, 16))),
> +                    nir_ior(b, nir_ishl(b, nir_channel(b, byte, 1), nir_imm_int(b, 8)),
> +                               nir_channel(b, byte, 0)));
> +
> +      nir_ssa_def_rewrite_uses(&instr->dest.dest.ssa, nir_src_for_ssa(val));
> +      nir_instr_remove(&instr->instr);
> +      break;
> +   }
> +
>     case nir_op_fdph: {
>        nir_ssa_def *sum[4];
>        for (unsigned i = 0; i < 3; i++) {
> diff --git a/src/glsl/nir/nir_opcodes.py b/src/glsl/nir/nir_opcodes.py
> index be3cd17..3b82c3c 100644
> --- a/src/glsl/nir/nir_opcodes.py
> +++ b/src/glsl/nir/nir_opcodes.py
> @@ -237,6 +237,16 @@ unpack_2x16("unorm")
>  unpack_4x8("unorm")
>  unpack_2x16("half")
>  
> +unop_horiz("pack_uvec2_to_uint", 0, tuint, 2, tuint, """
> +dst = (src0.x & 0xffff) | (src0.y >> 16);
> +""")

This should have 1 as the output size instead of 0.

> +unop_horiz("pack_uvec4_to_uint", 0, tuint, 4, tuint, """
> +dst = (src0.x <<  0) |
> +      (src0.y <<  8) |
> +      (src0.z << 16) |
> +      (src0.w << 24);
> +""")

Same here.
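
For reference, a minimal sketch of what I think the corrected
declarations would look like. Only the output-size argument changes from
0 to 1 (and with a fixed output size the constant-folding body assigns to
dst.x, as pack_half_2x16 does); I have also assumed the 2x16 body means
to shift src0.y left by 16 rather than right, so the sketch uses <<:

   unop_horiz("pack_uvec2_to_uint", 1, tuint, 2, tuint, """
   dst.x = (src0.x & 0xffff) | (src0.y << 16);
   """)

   unop_horiz("pack_uvec4_to_uint", 1, tuint, 4, tuint, """
   dst.x = (src0.x <<  0) |
           (src0.y <<  8) |
           (src0.z << 16) |
           (src0.w << 24);
   """)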

 
>  # Lowered floating point unpacking operations.
>  
> diff --git a/src/glsl/nir/nir_opt_algebraic.py b/src/glsl/nir/nir_opt_algebraic.py
> index b761b54..56b0f5e 100644
> --- a/src/glsl/nir/nir_opt_algebraic.py
> +++ b/src/glsl/nir/nir_opt_algebraic.py
> @@ -258,6 +258,26 @@ optimizations = [
>     (('extract_uword', a, b),
>      ('iand', ('ushr', a, ('imul', b, 16)), 0xffff),
>      'options->lower_extract_word'),
> +
> +    (('pack_unorm_2x16', 'v'),
> +     ('pack_uvec2_to_uint',
> +        ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 65535.0)))),
> +     'options->lower_pack_unorm_2x16'),
> +
> +    (('pack_unorm_4x8', 'v'),
> +     ('pack_uvec4_to_uint',
> +        ('f2u', ('fround_even', ('fmul', ('fsat', 'v'), 255.0)))),
> +     'options->lower_pack_unorm_4x8'),
> +
> +    (('pack_snorm_2x16', 'v'),
> +     ('pack_uvec2_to_uint',
> +        ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0)))),
> +     'options->lower_pack_snorm_2x16'),
> +
> +    (('pack_snorm_4x8', 'v'),
> +     ('pack_uvec4_to_uint',
> +        ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0)))),
> +     'options->lower_pack_snorm_4x8'),

I think the pack_snorm_* opcodes need an i2u conversion at the end.
That is what the GLSL IR lowering does, and it is also what the spec [1]
seems to indicate:

    "The functions packUnorm2x16(), packSnorm2x16(), packUnorm4x8(), and
    packSnorm4x8() first convert each component of a two- or four-component
    vector of normalized floating-point values into 8- or 16-bit integer
    values.  Then, the results are packed into a 32-bit unsigned integer.
    The first component of the vector will be written to the least
    significant bits of the output; the last component will be written to
    the most significant bits."

[1] https://www.opengl.org/registry/specs/ARB/shading_language_packing.txt
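
To illustrate, something like this in nir_opt_algebraic.py is what I have
in mind (untested, and assuming NIR has an i2u conversion opcode to
reinterpret the signed result; if it does not, we would need whatever the
equivalent reinterpretation is):

    (('pack_snorm_2x16', 'v'),
     ('pack_uvec2_to_uint',
        ('i2u', ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 32767.0))))),
     'options->lower_pack_snorm_2x16'),

    (('pack_snorm_4x8', 'v'),
     ('pack_uvec4_to_uint',
        ('i2u', ('f2i', ('fround_even', ('fmul', ('fmin', 1.0, ('fmax', -1.0, 'v')), 127.0))))),
     'options->lower_pack_snorm_4x8'),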

>  ]
>  
>  # Add optimizations to handle the case where the result of a ternary is



