[Mesa-dev] [PATCH] glsl: provide the option of using BFE for unpack builting lowering
Ilia Mirkin
imirkin at alum.mit.edu
Thu Aug 20 19:19:06 PDT 2015
On Thu, Aug 20, 2015 at 9:58 PM, Ilia Mirkin <imirkin at alum.mit.edu> wrote:
> This greatly improves generated code, especially for the snorm variants,
> since it is able to get rid of the lshift/rshift for sext, as well as
> replacing each shift + mask with a single op.
>
> Signed-off-by: Ilia Mirkin <imirkin at alum.mit.edu>
> ---
>
> This actually slightly pessimizes nvc0 generated code for the unorm
> case because it is no longer able to detect that it can instruct cvt
> to read a byte directly, but that will be fixed in the backend later.
>
> src/glsl/ir_builder.cpp | 6 ++
> src/glsl/ir_builder.h | 1 +
> src/glsl/ir_optimization.h | 1 +
> src/glsl/lower_packing_builtins.cpp | 112 ++++++++++++++++++++++++-----
> src/mesa/state_tracker/st_glsl_to_tgsi.cpp | 3 +-
> 5 files changed, 105 insertions(+), 18 deletions(-)
>
> diff --git a/src/glsl/ir_builder.cpp b/src/glsl/ir_builder.cpp
> index cd03859..c9cf124 100644
> --- a/src/glsl/ir_builder.cpp
> +++ b/src/glsl/ir_builder.cpp
> @@ -567,6 +567,12 @@ csel(operand a, operand b, operand c)
> }
>
> ir_expression *
> +bitfield_extract(operand a, operand b, operand c)
> +{
> + return expr(ir_triop_bitfield_extract, a, b, c);
> +}
> +
> +ir_expression *
> bitfield_insert(operand a, operand b, operand c, operand d)
> {
> void *mem_ctx = ralloc_parent(a.val);
> diff --git a/src/glsl/ir_builder.h b/src/glsl/ir_builder.h
> index f76453f..b483ebf 100644
> --- a/src/glsl/ir_builder.h
> +++ b/src/glsl/ir_builder.h
> @@ -200,6 +200,7 @@ ir_expression *interpolate_at_sample(operand a, operand b);
> ir_expression *fma(operand a, operand b, operand c);
> ir_expression *lrp(operand x, operand y, operand a);
> ir_expression *csel(operand a, operand b, operand c);
> +ir_expression *bitfield_extract(operand a, operand b, operand c);
> ir_expression *bitfield_insert(operand a, operand b, operand c, operand d);
>
> ir_swizzle *swizzle(operand a, int swizzle, int components);
> diff --git a/src/glsl/ir_optimization.h b/src/glsl/ir_optimization.h
> index b955874..265b223 100644
> --- a/src/glsl/ir_optimization.h
> +++ b/src/glsl/ir_optimization.h
> @@ -69,6 +69,7 @@ enum lower_packing_builtins_op {
> LOWER_UNPACK_UNORM_4x8 = 0x0800,
>
> LOWER_PACK_USE_BFI = 0x1000,
> + LOWER_PACK_USE_BFE = 0x2000,
> };
>
> bool do_common_optimization(exec_list *ir, bool linked,
> diff --git a/src/glsl/lower_packing_builtins.cpp b/src/glsl/lower_packing_builtins.cpp
> index be17e1d..e77e5d9 100644
> --- a/src/glsl/lower_packing_builtins.cpp
> +++ b/src/glsl/lower_packing_builtins.cpp
> @@ -118,6 +118,7 @@ public:
> *rvalue = split_unpack_half_2x16(op0);
> break;
> case LOWER_PACK_UNPACK_NONE:
> + default:
> assert(!"not reached");
> break;
> }
> @@ -307,6 +308,39 @@ private:
> }
>
> /**
> + * \brief Unpack a uint32 into two int16's.
> + *
> + * Specifically each 16-bit value is sign-extended to the full width of an
> + * int32 on return.
> + */
> + ir_rvalue *
> + unpack_uint_to_ivec2(ir_rvalue *uint_rval)
> + {
> + assert(uint_rval->type == glsl_type::uint_type);
> +
> + if (!(op_mask & LOWER_PACK_USE_BFE)) {
> + return rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
> + constant(16u)),
> + constant(16u));
> + }
> +
> + ir_variable *i = factory.make_temp(glsl_type::int_type,
> + "tmp_unpack_uint_to_ivec2_i");
> + factory.emit(assign(i, u2i(uint_rval)));
> +
> + /* ivec2 i2; */
> + ir_variable *i2 = factory.make_temp(glsl_type::ivec4_type,
> + "tmp_unpack_uint_to_ivec4_i2");
This should, of course, be an ivec2, as the comment indicates. Oops!
> +
> + factory.emit(assign(i2, bitfield_extract(i, constant(0), constant(16)),
> + WRITEMASK_X));
> + factory.emit(assign(i2, bitfield_extract(i, constant(16), constant(16)),
> + WRITEMASK_Y));
> +
> + return deref(i2).val;
> + }
> +
> + /**
> * \brief Unpack a uint32 into four uint8's.
> *
> * Interpret the given uint32 as a uint8 4-tuple where the uint32's least
> @@ -327,21 +361,69 @@ private:
> ir_variable *u4 = factory.make_temp(glsl_type::uvec4_type,
> "tmp_unpack_uint_to_uvec4_u4");
>
> - /* u4.x = u & 0xffu; */
> - factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
> + if (op_mask & LOWER_PACK_USE_BFE) {
> + factory.emit(assign(u4, bitfield_extract(u, constant(0), constant(8)),
> + WRITEMASK_X));
> + factory.emit(assign(u4, bitfield_extract(u, constant(8), constant(8)),
> + WRITEMASK_Y));
> + factory.emit(assign(u4, bitfield_extract(u, constant(16), constant(8)),
> + WRITEMASK_Z));
> + factory.emit(assign(u4, bitfield_extract(u, constant(24), constant(8)),
> + WRITEMASK_W));
Not sure if this is desirable. Perhaps only do it for the middle 2 ops
which are actually 2 ops and leave the first and last single-op ones
alone?
> + } else {
> + /* u4.x = u & 0xffu; */
> + factory.emit(assign(u4, bit_and(u, constant(0xffu)), WRITEMASK_X));
> +
> + /* u4.y = (u >> 8u) & 0xffu; */
> + factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
> + constant(0xffu)), WRITEMASK_Y));
> +
> + /* u4.z = (u >> 16u) & 0xffu; */
> + factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
> + constant(0xffu)), WRITEMASK_Z));
> +
> + /* u4.w = (u >> 24u) */
> + factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
> + }
>
> - /* u4.y = (u >> 8u) & 0xffu; */
> - factory.emit(assign(u4, bit_and(rshift(u, constant(8u)),
> - constant(0xffu)), WRITEMASK_Y));
> + return deref(u4).val;
> + }
>
> - /* u4.z = (u >> 16u) & 0xffu; */
> - factory.emit(assign(u4, bit_and(rshift(u, constant(16u)),
> - constant(0xffu)), WRITEMASK_Z));
> + /**
> + * \brief Unpack a uint32 into four int8's.
> + *
> + * Specifically each 8-bit value is sign-extended to the full width of an
> + * int32 on return.
> + */
> + ir_rvalue *
> + unpack_uint_to_ivec4(ir_rvalue *uint_rval)
> + {
> + assert(uint_rval->type == glsl_type::uint_type);
>
> - /* u4.w = (u >> 24u) */
> - factory.emit(assign(u4, rshift(u, constant(24u)), WRITEMASK_W));
> + if (!(op_mask & LOWER_PACK_USE_BFE)) {
> + return rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
> + constant(24u)),
> + constant(24u));
> + }
>
> - return deref(u4).val;
> + ir_variable *i = factory.make_temp(glsl_type::int_type,
> + "tmp_unpack_uint_to_ivec4_i");
> + factory.emit(assign(i, u2i(uint_rval)));
> +
> + /* ivec4 i4; */
> + ir_variable *i4 = factory.make_temp(glsl_type::ivec4_type,
> + "tmp_unpack_uint_to_ivec4_i4");
> +
> + factory.emit(assign(i4, bitfield_extract(i, constant(0), constant(8)),
> + WRITEMASK_X));
> + factory.emit(assign(i4, bitfield_extract(i, constant(8), constant(8)),
> + WRITEMASK_Y));
> + factory.emit(assign(i4, bitfield_extract(i, constant(16), constant(8)),
> + WRITEMASK_Z));
> + factory.emit(assign(i4, bitfield_extract(i, constant(24), constant(8)),
> + WRITEMASK_W));
> +
> + return deref(i4).val;
> }
>
> /**
> @@ -490,9 +572,7 @@ private:
> assert(uint_rval->type == glsl_type::uint_type);
>
> ir_rvalue *result =
> - clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec2(uint_rval)),
> - constant(16)),
> - constant(16u))),
> + clamp(div(i2f(unpack_uint_to_ivec2(uint_rval)),
> constant(32767.0f)),
> constant(-1.0f),
> constant(1.0f));
> @@ -549,9 +629,7 @@ private:
> assert(uint_rval->type == glsl_type::uint_type);
>
> ir_rvalue *result =
> - clamp(div(i2f(rshift(lshift(u2i(unpack_uint_to_uvec4(uint_rval)),
> - constant(24u)),
> - constant(24u))),
> + clamp(div(i2f(unpack_uint_to_ivec4(uint_rval)),
> constant(127.0f)),
> constant(-1.0f),
> constant(1.0f));
> diff --git a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> index d3c7238..123d1b7 100644
> --- a/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> +++ b/src/mesa/state_tracker/st_glsl_to_tgsi.cpp
> @@ -5996,7 +5996,8 @@ st_link_shader(struct gl_context *ctx, struct gl_shader_program *prog)
> LOWER_UNPACK_HALF_2x16;
>
> if (ctx->Extensions.ARB_gpu_shader5)
> - lower_inst |= LOWER_PACK_USE_BFI;
> + lower_inst |= LOWER_PACK_USE_BFI |
> + LOWER_PACK_USE_BFE;
>
> lower_packing_builtins(ir, lower_inst);
> }
> --
> 2.4.6
>
More information about the mesa-dev
mailing list