[Mesa-dev] [PATCH 15/50] glsl: Add "built-in" functions to do sqrt(fp64)

Wed Mar 14 02:56:02 UTC 2018

Am 13.03.2018 um 05:24 schrieb Dave Airlie:
> From: Elie Tournier <tournier.elie at gmail.com>
> 
> This currently uses fp64->fp32, sqrt(fp32), fp32->fp64.
> 
> [airlied: The code is included from soft float for doing a proper sqrt64,
> but it needs to be decided whether we need to pursue this and
> how to optimise it better.]
> 
> Signed-off-by: Elie Tournier <elie.tournier at collabora.com>
> ---
>  src/compiler/glsl/builtin_float64.h     | 393 ++++++++++++++++++++++++++++++++
>  src/compiler/glsl/builtin_functions.cpp |   4 +
>  src/compiler/glsl/builtin_functions.h   |   3 +
>  src/compiler/glsl/float64.glsl          | 275 ++++++++++++++++++++++
>  src/compiler/glsl/glcpp/glcpp-parse.y   |   1 +
>  5 files changed, 676 insertions(+)
> 
> diff --git a/src/compiler/glsl/builtin_float64.h b/src/compiler/glsl/builtin_float64.h
> index 034d2d0..6fbe12d 100644
> --- a/src/compiler/glsl/builtin_float64.h
> +++ b/src/compiler/glsl/builtin_float64.h
> @@ -6242,3 +6242,396 @@ fp32_to_fp64(void *mem_ctx, builtin_available_predicate avail)
>     sig->replace_parameters(&sig_parameters);
>     return sig;
>  }
> +ir_function_signature *
> +fsqrt64(void *mem_ctx, builtin_available_predicate avail)
> +{
> +   ir_function_signature *const sig =
> +      new(mem_ctx) ir_function_signature(glsl_type::uvec2_type, avail);
> +   ir_factory body(&sig->body, mem_ctx);
> +   sig->is_defined = true;
> +
> +   exec_list sig_parameters;
> +
> +   ir_variable *const r09A9 = new(mem_ctx) ir_variable(glsl_type::uvec2_type, "a", ir_var_function_in);
> +   sig_parameters.push_tail(r09A9);
> +   ir_variable *const r09AA = body.make_temp(glsl_type::uvec2_type, "a");
> +   body.emit(assign(r09AA, r09A9, 0x03));
> +
> +   ir_variable *const r09AB = body.make_temp(glsl_type::float_type, "return_value");
> +   ir_variable *const r09AC = body.make_temp(glsl_type::uint_type, "extractFloat64FracHi_retval");
> +   body.emit(assign(r09AC, bit_and(swizzle_y(r09A9), body.constant(1048575u)), 0x01));
> +
> +   ir_variable *const r09AD = body.make_temp(glsl_type::int_type, "extractFloat64Exp_retval");
> +   ir_expression *const r09AE = rshift(swizzle_y(r09A9), body.constant(int(20)));
> +   ir_expression *const r09AF = bit_and(r09AE, body.constant(2047u));
> +   body.emit(assign(r09AD, expr(ir_unop_u2i, r09AF), 0x01));
> +
> +   ir_variable *const r09B0 = body.make_temp(glsl_type::uint_type, "extractFloat64Sign_retval");
> +   body.emit(assign(r09B0, rshift(swizzle_y(r09A9), body.constant(int(31))), 0x01));
> +
> +   /* IF CONDITION */
> +   ir_expression *const r09B2 = equal(r09AD, body.constant(int(2047)));
> +   ir_if *f09B1 = new(mem_ctx) ir_if(operand(r09B2).val);
> +   exec_list *const f09B1_parent_instructions = body.instructions;
> +
> +      /* THEN INSTRUCTIONS */
> +      body.instructions = &f09B1->then_instructions;
> +
> +      ir_variable *const r09B3 = new(mem_ctx) ir_variable(glsl_type::float_type, "rval", ir_var_auto);
> +      body.emit(r09B3);
> +      ir_expression *const r09B4 = lshift(swizzle_y(r09A9), body.constant(int(12)));
> +      ir_expression *const r09B5 = rshift(swizzle_x(r09A9), body.constant(int(20)));
> +      body.emit(assign(r09AA, bit_or(r09B4, r09B5), 0x02));
> +
> +      body.emit(assign(r09AA, lshift(swizzle_x(r09A9), body.constant(int(12))), 0x01));
> +
> +      ir_expression *const r09B6 = lshift(r09B0, body.constant(int(31)));
> +      ir_expression *const r09B7 = bit_or(r09B6, body.constant(2143289344u));
> +      ir_expression *const r09B8 = rshift(swizzle_y(r09AA), body.constant(int(9)));
> +      ir_expression *const r09B9 = bit_or(r09B7, r09B8);
> +      body.emit(assign(r09B3, expr(ir_unop_bitcast_u2f, r09B9), 0x01));
> +
> +      ir_variable *const r09BA = body.make_temp(glsl_type::float_type, "mix_retval");
> +      ir_expression *const r09BB = bit_or(r09AC, swizzle_x(r09A9));
> +      ir_expression *const r09BC = nequal(r09BB, body.constant(0u));
> +      ir_expression *const r09BD = lshift(r09B0, body.constant(int(31)));
> +      ir_expression *const r09BE = add(r09BD, body.constant(2139095040u));
> +      ir_expression *const r09BF = expr(ir_unop_bitcast_u2f, r09BE);
> +      body.emit(assign(r09BA, expr(ir_triop_csel, r09BC, r09B3, r09BF), 0x01));
> +
> +      body.emit(assign(r09B3, r09BA, 0x01));
> +
> +      body.emit(assign(r09AB, r09BA, 0x01));
> +
> +
> +      /* ELSE INSTRUCTIONS */
> +      body.instructions = &f09B1->else_instructions;
> +
> +      ir_variable *const r09C0 = body.make_temp(glsl_type::uint_type, "mix_retval");
> +      ir_expression *const r09C1 = lshift(r09AC, body.constant(int(10)));
> +      ir_expression *const r09C2 = rshift(swizzle_x(r09A9), body.constant(int(22)));
> +      ir_expression *const r09C3 = bit_or(r09C1, r09C2);
> +      ir_expression *const r09C4 = lshift(swizzle_x(r09A9), body.constant(int(10)));
> +      ir_expression *const r09C5 = nequal(r09C4, body.constant(0u));
> +      ir_expression *const r09C6 = expr(ir_unop_b2i, r09C5);
> +      ir_expression *const r09C7 = expr(ir_unop_i2u, r09C6);
> +      body.emit(assign(r09C0, bit_or(r09C3, r09C7), 0x01));
> +
> +      ir_variable *const r09C8 = body.make_temp(glsl_type::uint_type, "mix_retval");
> +      ir_expression *const r09C9 = nequal(r09AD, body.constant(int(0)));
> +      ir_expression *const r09CA = bit_or(r09C0, body.constant(1073741824u));
> +      body.emit(assign(r09C8, expr(ir_triop_csel, r09C9, r09CA, r09C0), 0x01));
> +
> +      ir_variable *const r09CB = body.make_temp(glsl_type::int_type, "zExp");
> +      body.emit(assign(r09CB, add(r09AD, body.constant(int(-897))), 0x01));
> +
> +      ir_variable *const r09CC = body.make_temp(glsl_type::uint_type, "zFrac");
> +      body.emit(assign(r09CC, r09C8, 0x01));
> +
> +      ir_variable *const r09CD = body.make_temp(glsl_type::bool_type, "execute_flag");
> +      body.emit(assign(r09CD, body.constant(true), 0x01));
> +
> +      ir_variable *const r09CE = body.make_temp(glsl_type::float_type, "return_value");
> +      ir_variable *const r09CF = new(mem_ctx) ir_variable(glsl_type::int_type, "roundBits", ir_var_auto);
> +      body.emit(r09CF);
> +      ir_expression *const r09D0 = bit_and(r09C8, body.constant(127u));
> +      body.emit(assign(r09CF, expr(ir_unop_u2i, r09D0), 0x01));
> +
> +      /* IF CONDITION */
> +      ir_expression *const r09D2 = expr(ir_unop_i2u, r09CB);
> +      ir_expression *const r09D3 = gequal(r09D2, body.constant(253u));
> +      ir_if *f09D1 = new(mem_ctx) ir_if(operand(r09D3).val);
> +      exec_list *const f09D1_parent_instructions = body.instructions;
> +
> +         /* THEN INSTRUCTIONS */
> +         body.instructions = &f09D1->then_instructions;
> +
> +         /* IF CONDITION */
> +         ir_expression *const r09D5 = less(body.constant(int(253)), r09CB);
> +         ir_expression *const r09D6 = equal(r09CB, body.constant(int(253)));
> +         ir_expression *const r09D7 = expr(ir_unop_u2i, r09C8);
> +         ir_expression *const r09D8 = less(r09D7, body.constant(int(-64)));
> +         ir_expression *const r09D9 = logic_and(r09D6, r09D8);
> +         ir_expression *const r09DA = logic_or(r09D5, r09D9);
> +         ir_if *f09D4 = new(mem_ctx) ir_if(operand(r09DA).val);
> +         exec_list *const f09D4_parent_instructions = body.instructions;
> +
> +            /* THEN INSTRUCTIONS */
> +            body.instructions = &f09D4->then_instructions;
> +
> +            ir_expression *const r09DB = lshift(r09B0, body.constant(int(31)));
> +            ir_expression *const r09DC = add(r09DB, body.constant(2139095040u));
> +            body.emit(assign(r09CE, expr(ir_unop_bitcast_u2f, r09DC), 0x01));
> +
> +            body.emit(assign(r09CD, body.constant(false), 0x01));
> +
> +
> +            /* ELSE INSTRUCTIONS */
> +            body.instructions = &f09D4->else_instructions;
> +
> +            ir_variable *const r09DD = body.make_temp(glsl_type::int_type, "assignment_tmp");
> +            body.emit(assign(r09DD, neg(r09CB), 0x01));
> +
> +            ir_variable *const r09DE = body.make_temp(glsl_type::bool_type, "assignment_tmp");
> +            body.emit(assign(r09DE, less(r09CB, body.constant(int(0))), 0x01));
> +
> +            ir_variable *const r09DF = body.make_temp(glsl_type::uint_type, "mix_retval");
> +            ir_expression *const r09E0 = neg(r09CB);
> +            ir_expression *const r09E1 = less(r09E0, body.constant(int(32)));
> +            ir_expression *const r09E2 = rshift(r09C8, r09DD);
> +            ir_expression *const r09E3 = neg(r09DD);
> +            ir_expression *const r09E4 = bit_and(r09E3, body.constant(int(31)));
> +            ir_expression *const r09E5 = lshift(r09C8, r09E4);
> +            ir_expression *const r09E6 = nequal(r09E5, body.constant(0u));
> +            ir_expression *const r09E7 = expr(ir_unop_b2i, r09E6);
> +            ir_expression *const r09E8 = expr(ir_unop_i2u, r09E7);
> +            ir_expression *const r09E9 = bit_or(r09E2, r09E8);
> +            ir_expression *const r09EA = nequal(r09C8, body.constant(0u));
> +            ir_expression *const r09EB = expr(ir_unop_b2i, r09EA);
> +            ir_expression *const r09EC = expr(ir_unop_i2u, r09EB);
> +            ir_expression *const r09ED = expr(ir_triop_csel, r09E1, r09E9, r09EC);
> +            body.emit(assign(r09DF, expr(ir_triop_csel, r09DE, r09ED, r09C8), 0x01));
> +
> +            body.emit(assign(r09CC, r09DF, 0x01));
> +
> +            ir_expression *const r09EE = expr(ir_unop_u2i, r09DF);
> +            ir_expression *const r09EF = bit_and(r09EE, body.constant(int(127)));
> +            body.emit(assign(r09CF, expr(ir_triop_csel, r09DE, r09EF, r09CF), 0x01));
> +
> +            body.emit(assign(r09CB, expr(ir_triop_csel, r09DE, body.constant(int(0)), r09CB), 0x01));
> +
> +
> +         body.instructions = f09D4_parent_instructions;
> +         body.emit(f09D4);
> +
> +         /* END IF */
> +
> +
> +      body.instructions = f09D1_parent_instructions;
> +      body.emit(f09D1);
> +
> +      /* END IF */
> +
> +      /* IF CONDITION */
> +      ir_if *f09F0 = new(mem_ctx) ir_if(operand(r09CD).val);
> +      exec_list *const f09F0_parent_instructions = body.instructions;
> +
> +         /* THEN INSTRUCTIONS */
> +         body.instructions = &f09F0->then_instructions;
> +
> +         ir_expression *const r09F1 = add(r09CC, body.constant(64u));
> +         body.emit(assign(r09CC, rshift(r09F1, body.constant(int(7))), 0x01));
> +
> +         ir_expression *const r09F2 = bit_xor(r09CF, body.constant(int(64)));
> +         ir_expression *const r09F3 = equal(r09F2, body.constant(int(0)));
> +         ir_expression *const r09F4 = expr(ir_unop_b2i, r09F3);
> +         ir_expression *const r09F5 = expr(ir_unop_i2u, r09F4);
> +         ir_expression *const r09F6 = expr(ir_unop_bit_not, r09F5);
> +         body.emit(assign(r09CC, bit_and(r09CC, r09F6), 0x01));
> +
> +         ir_expression *const r09F7 = lshift(r09B0, body.constant(int(31)));
> +         ir_expression *const r09F8 = equal(r09CC, body.constant(0u));
> +         ir_expression *const r09F9 = expr(ir_triop_csel, r09F8, body.constant(int(0)), r09CB);
> +         ir_expression *const r09FA = expr(ir_unop_i2u, r09F9);
> +         ir_expression *const r09FB = lshift(r09FA, body.constant(int(23)));
> +         ir_expression *const r09FC = add(r09F7, r09FB);
> +         ir_expression *const r09FD = add(r09FC, r09CC);
> +         body.emit(assign(r09CE, expr(ir_unop_bitcast_u2f, r09FD), 0x01));
> +
> +         body.emit(assign(r09CD, body.constant(false), 0x01));
> +
> +
> +      body.instructions = f09F0_parent_instructions;
> +      body.emit(f09F0);
> +
> +      /* END IF */
> +
> +      body.emit(assign(r09AB, r09CE, 0x01));
> +
> +
> +   body.instructions = f09B1_parent_instructions;
> +   body.emit(f09B1);
> +
> +   /* END IF */
> +
> +   ir_variable *const r09FE = body.make_temp(glsl_type::bool_type, "execute_flag");
> +   body.emit(assign(r09FE, body.constant(true), 0x01));
> +
> +   ir_variable *const r09FF = body.make_temp(glsl_type::uvec2_type, "return_value");
> +   ir_variable *const r0A00 = new(mem_ctx) ir_variable(glsl_type::uint_type, "aSign", ir_var_auto);
> +   body.emit(r0A00);
> +   ir_variable *const r0A01 = new(mem_ctx) ir_variable(glsl_type::int_type, "aExp", ir_var_auto);
> +   body.emit(r0A01);
> +   ir_variable *const r0A02 = new(mem_ctx) ir_variable(glsl_type::uint_type, "aFrac", ir_var_auto);
> +   body.emit(r0A02);
> +   ir_variable *const r0A03 = body.make_temp(glsl_type::uint_type, "floatBitsToUint_retval");
> +   ir_expression *const r0A04 = expr(ir_unop_sqrt, r09AB);
> +   body.emit(assign(r0A03, expr(ir_unop_bitcast_f2u, r0A04), 0x01));
> +
> +   ir_variable *const r0A05 = body.make_temp(glsl_type::uint_type, "assignment_tmp");
> +   body.emit(assign(r0A05, bit_and(r0A03, body.constant(8388607u)), 0x01));
> +
> +   body.emit(assign(r0A02, r0A05, 0x01));
> +
> +   ir_variable *const r0A06 = body.make_temp(glsl_type::int_type, "assignment_tmp");
> +   ir_expression *const r0A07 = rshift(r0A03, body.constant(int(23)));
> +   ir_expression *const r0A08 = bit_and(r0A07, body.constant(255u));
> +   body.emit(assign(r0A06, expr(ir_unop_u2i, r0A08), 0x01));
> +
> +   body.emit(assign(r0A01, r0A06, 0x01));
> +
> +   body.emit(assign(r0A00, rshift(r0A03, body.constant(int(31))), 0x01));
> +
> +   /* IF CONDITION */
> +   ir_expression *const r0A0A = equal(r0A06, body.constant(int(255)));
> +   ir_if *f0A09 = new(mem_ctx) ir_if(operand(r0A0A).val);
> +   exec_list *const f0A09_parent_instructions = body.instructions;
> +
> +      /* THEN INSTRUCTIONS */
> +      body.instructions = &f0A09->then_instructions;
> +
> +      /* IF CONDITION */
> +      ir_expression *const r0A0C = nequal(r0A05, body.constant(0u));
> +      ir_if *f0A0B = new(mem_ctx) ir_if(operand(r0A0C).val);
> +      exec_list *const f0A0B_parent_instructions = body.instructions;
> +
> +         /* THEN INSTRUCTIONS */
> +         body.instructions = &f0A0B->then_instructions;
> +
> +         ir_variable *const r0A0D = body.make_temp(glsl_type::uint_type, "assignment_tmp");
> +         body.emit(assign(r0A0D, lshift(r0A03, body.constant(int(9))), 0x01));
> +
> +         ir_variable *const r0A0E = body.make_temp(glsl_type::uvec2_type, "vec_ctor");
> +         ir_expression *const r0A0F = lshift(r0A0D, body.constant(int(20)));
> +         body.emit(assign(r0A0E, bit_or(r0A0F, body.constant(0u)), 0x01));
> +
> +         ir_expression *const r0A10 = rshift(r0A0D, body.constant(int(12)));
> +         ir_expression *const r0A11 = lshift(r0A00, body.constant(int(31)));
> +         ir_expression *const r0A12 = bit_or(r0A11, body.constant(2146959360u));
> +         body.emit(assign(r0A0E, bit_or(r0A10, r0A12), 0x02));
> +
> +         body.emit(assign(r09FF, r0A0E, 0x03));
> +
> +         body.emit(assign(r09FE, body.constant(false), 0x01));
> +
> +
> +         /* ELSE INSTRUCTIONS */
> +         body.instructions = &f0A0B->else_instructions;
> +
> +         ir_variable *const r0A13 = new(mem_ctx) ir_variable(glsl_type::uvec2_type, "z", ir_var_auto);
> +         body.emit(r0A13);
> +         ir_expression *const r0A14 = lshift(r0A00, body.constant(int(31)));
> +         body.emit(assign(r0A13, add(r0A14, body.constant(2146435072u)), 0x02));
> +
> +         body.emit(assign(r0A13, body.constant(0u), 0x01));
> +
> +         body.emit(assign(r09FF, r0A13, 0x03));
> +
> +         body.emit(assign(r09FE, body.constant(false), 0x01));
> +
> +
> +      body.instructions = f0A0B_parent_instructions;
> +      body.emit(f0A0B);
> +
> +      /* END IF */
> +
> +
> +      /* ELSE INSTRUCTIONS */
> +      body.instructions = &f0A09->else_instructions;
> +
> +      /* IF CONDITION */
> +      ir_expression *const r0A16 = equal(r0A06, body.constant(int(0)));
> +      ir_if *f0A15 = new(mem_ctx) ir_if(operand(r0A16).val);
> +      exec_list *const f0A15_parent_instructions = body.instructions;
> +
> +         /* THEN INSTRUCTIONS */
> +         body.instructions = &f0A15->then_instructions;
> +
> +         /* IF CONDITION */
> +         ir_expression *const r0A18 = equal(r0A05, body.constant(0u));
> +         ir_if *f0A17 = new(mem_ctx) ir_if(operand(r0A18).val);
> +         exec_list *const f0A17_parent_instructions = body.instructions;
> +
> +            /* THEN INSTRUCTIONS */
> +            body.instructions = &f0A17->then_instructions;
> +
> +            ir_variable *const r0A19 = new(mem_ctx) ir_variable(glsl_type::uvec2_type, "z", ir_var_auto);
> +            body.emit(r0A19);
> +            body.emit(assign(r0A19, lshift(r0A00, body.constant(int(31))), 0x02));
> +
> +            body.emit(assign(r0A19, body.constant(0u), 0x01));
> +
> +            body.emit(assign(r09FF, r0A19, 0x03));
> +
> +            body.emit(assign(r09FE, body.constant(false), 0x01));
> +
> +
> +            /* ELSE INSTRUCTIONS */
> +            body.instructions = &f0A17->else_instructions;
> +
> +            ir_variable *const r0A1A = body.make_temp(glsl_type::int_type, "assignment_tmp");
> +            ir_expression *const r0A1B = equal(r0A05, body.constant(0u));
> +            ir_expression *const r0A1C = expr(ir_unop_find_msb, r0A05);
> +            ir_expression *const r0A1D = sub(body.constant(int(31)), r0A1C);
> +            ir_expression *const r0A1E = expr(ir_triop_csel, r0A1B, body.constant(int(32)), r0A1D);
> +            body.emit(assign(r0A1A, add(r0A1E, body.constant(int(-8))), 0x01));
> +
> +            body.emit(assign(r0A02, lshift(r0A05, r0A1A), 0x01));
> +
> +            body.emit(assign(r0A01, sub(body.constant(int(1)), r0A1A), 0x01));
> +
> +            body.emit(assign(r0A01, add(r0A01, body.constant(int(-1))), 0x01));
> +
> +
> +         body.instructions = f0A17_parent_instructions;
> +         body.emit(f0A17);
> +
> +         /* END IF */
> +
> +
> +      body.instructions = f0A15_parent_instructions;
> +      body.emit(f0A15);
> +
> +      /* END IF */
> +
> +      /* IF CONDITION */
> +      ir_if *f0A1F = new(mem_ctx) ir_if(operand(r09FE).val);
> +      exec_list *const f0A1F_parent_instructions = body.instructions;
> +
> +         /* THEN INSTRUCTIONS */
> +         body.instructions = &f0A1F->then_instructions;
> +
> +         ir_variable *const r0A20 = new(mem_ctx) ir_variable(glsl_type::uvec2_type, "z", ir_var_auto);
> +         body.emit(r0A20);
> +         ir_expression *const r0A21 = lshift(r0A00, body.constant(int(31)));
> +         ir_expression *const r0A22 = add(r0A01, body.constant(int(896)));
> +         ir_expression *const r0A23 = expr(ir_unop_i2u, r0A22);
> +         ir_expression *const r0A24 = lshift(r0A23, body.constant(int(20)));
> +         ir_expression *const r0A25 = add(r0A21, r0A24);
> +         ir_expression *const r0A26 = rshift(r0A02, body.constant(int(3)));
> +         body.emit(assign(r0A20, add(r0A25, r0A26), 0x02));
> +
> +         ir_expression *const r0A27 = lshift(r0A02, body.constant(int(29)));
> +         body.emit(assign(r0A20, bit_or(r0A27, body.constant(0u)), 0x01));
> +
> +         body.emit(assign(r09FF, r0A20, 0x03));
> +
> +         body.emit(assign(r09FE, body.constant(false), 0x01));
> +
> +
> +      body.instructions = f0A1F_parent_instructions;
> +      body.emit(f0A1F);
> +
> +      /* END IF */
> +
> +
> +   body.instructions = f0A09_parent_instructions;
> +   body.emit(f0A09);
> +
> +   /* END IF */
> +
> +   body.emit(ret(r09FF));
> +
> +   sig->replace_parameters(&sig_parameters);
> +   return sig;
> +}
> diff --git a/src/compiler/glsl/builtin_functions.cpp b/src/compiler/glsl/builtin_functions.cpp
> index 48e0b20..d919873 100644
> --- a/src/compiler/glsl/builtin_functions.cpp
> +++ b/src/compiler/glsl/builtin_functions.cpp
> @@ -3394,6 +3394,10 @@ builtin_builder::create_builtins()
>                  generate_ir::int_to_fp64(mem_ctx, integer_functions_supported),
>                  NULL);
>  
> +   add_function("__builtin_fsqrt64",
> +                generate_ir::fsqrt64(mem_ctx, integer_functions_supported),
> +                NULL);
> +
>  #undef F
>  #undef FI
>  #undef FIUD_VEC
> diff --git a/src/compiler/glsl/builtin_functions.h b/src/compiler/glsl/builtin_functions.h
> index f9cc0ad..2f72f51 100644
> --- a/src/compiler/glsl/builtin_functions.h
> +++ b/src/compiler/glsl/builtin_functions.h
> @@ -106,6 +106,9 @@ fp64_to_fp32(void *mem_ctx, builtin_available_predicate avail);
>  ir_function_signature *
>  fp32_to_fp64(void *mem_ctx, builtin_available_predicate avail);
>  
> +ir_function_signature *
> +fsqrt64(void *mem_ctx, builtin_available_predicate avail);
> +
>  }
>  
>  #endif /* BULITIN_FUNCTIONS_H */
> diff --git a/src/compiler/glsl/float64.glsl b/src/compiler/glsl/float64.glsl
> index 748e4af..c03f0f6 100644
> --- a/src/compiler/glsl/float64.glsl
> +++ b/src/compiler/glsl/float64.glsl
> @@ -1014,3 +1014,278 @@ fp32_to_fp64(float f)
>     shift64Right(aFrac, 0u, 3, zFrac0, zFrac1);
>     return packFloat64(aSign, aExp + 0x380, zFrac0, zFrac1);
>  }
> +
> +/* Adds the 96-bit value formed by concatenating `a0', `a1', and `a2' to the
> + * 96-bit value formed by concatenating `b0', `b1', and `b2'.  Addition is
> + * modulo 2^96, so any carry out is lost.  The result is broken into three
> + * 32-bit pieces which are stored at the locations pointed to by `z0Ptr',
> + * `z1Ptr', and `z2Ptr'.
> + */
> +/*void
> +add96(uint a0, uint a1, uint a2,
> +      uint b0, uint b1, uint b2,
> +      inout uint z0Ptr,
> +      inout uint z1Ptr,
> +      inout uint z2Ptr)
> +{
> +   uint z2 = a2 + b2;
> +   uint carry1 = uint(z2 < a2);
> +   uint z1 = a1 + b1;
> +   uint carry0 = uint(z1 < a1);
> +   uint z0 = a0 + b0;
> +   z1 += carry1;
> +   z0 += uint(z1 < carry1);
> +   z0 += carry0;
> +   z2Ptr = z2;
> +   z1Ptr = z1;
> +   z0Ptr = z0;
> +}*/
> +
> +/* Subtracts the 96-bit value formed by concatenating `b0', `b1', and `b2' from
> + * the 96-bit value formed by concatenating `a0', `a1', and `a2'.  Subtraction
> + * is modulo 2^96, so any borrow out (carry out) is lost.  The result is broken
> + * into three 32-bit pieces which are stored at the locations pointed to by
> + * `z0Ptr', `z1Ptr', and `z2Ptr'.
> + */
> +/*void
> +sub96(uint a0, uint a1, uint a2,
> +      uint b0, uint b1, uint b2,
> +      inout uint z0Ptr,
> +      inout uint z1Ptr,
> +      inout uint z2Ptr)
> +{
> +   uint z2 = a2 - b2;
> +   uint borrow1 = uint(a2 < b2);
> +   uint z1 = a1 - b1;
> +   uint borrow0 = uint(a1 < b1);
> +   uint z0 = a0 - b0;
> +   z0 -= uint(z1 < borrow1);
> +   z1 -= borrow1;
> +   z0 -= borrow0;
> +   z2Ptr = z2;
> +   z1Ptr = z1;
> +   z0Ptr = z0;
> +}*/
> +
> +/* Returns an approximation to the 32-bit integer quotient obtained by dividing
> + * `b' into the 64-bit value formed by concatenating `a0' and `a1'.  The
> + * divisor `b' must be at least 2^31.  If q is the exact quotient truncated
> + * toward zero, the approximation returned lies between q and q + 2 inclusive.
> + * If the exact quotient q is larger than 32 bits, the maximum positive 32-bit
> + * unsigned integer is returned.
> + */
> +/*uint
> +estimateDiv64To32(uint a0, uint a1, uint b)
> +{
> +   uint b0;
> +   uint b1;
> +   uint rem0 = 0u;
> +   uint rem1 = 0u;
> +   uint term0 = 0u;
> +   uint term1 = 0u;
> +   uint z;
> +
> +   if (b <= a0)
> +      return 0xFFFFFFFFu;
> +   b0 = b>>16;
> +   z = (b0<<16 <= a0) ? 0xFFFF0000u : (a0 / b0)<<16;
> +   mul32To64(b, z, term0, term1);
> +   sub64(a0, a1, term0, term1, rem0, rem1);
> +   while (int(rem0) < 0) {
> +      z -= 0x10000u;
> +      b1 = b<<16;
> +      add64(rem0, rem1, b0, b1, rem0, rem1);
> +   }
> +   rem0 = (rem0<<16) | (rem1>>16);
> +   z |= (b0<<16 <= rem0) ? 0xFFFFu : rem0 / b0;
> +   return z;
> +}*/
> +
> +/*uint
> +sqrtOddAdjustments(int index)
> +{
> +   uint res = 0u;
> +   if (index == 0)
> +      res = 0x0004u;
> +   if (index == 1)
> +      res = 0x0022u;
> +   if (index == 2)
> +      res = 0x005Du;
> +   if (index == 3)
> +      res = 0x00B1u;
> +   if (index == 4)
> +      res = 0x011Du;
> +   if (index == 5)
> +      res = 0x019Fu;
> +   if (index == 6)
> +      res = 0x0236u;
> +   if (index == 7)
> +      res = 0x02E0u;
> +   if (index == 8)
> +      res = 0x039Cu;
> +   if (index == 9)
> +      res = 0x0468u;
> +   if (index == 10)
> +      res = 0x0545u;
> +   if (index == 11)
> +      res = 0x631u;
> +   if (index == 12)
> +      res = 0x072Bu;
> +   if (index == 13)
> +      res = 0x0832u;
> +   if (index == 14)
> +      res = 0x0946u;
> +   if (index == 15)
> +      res = 0x0A67u;
> +
> +   return res;
> +}
> +
> +uint
> +sqrtEvenAdjustments(int index)
> +{
> +   uint res = 0u;
> +   if (index == 0)
> +      res = 0x0A2Du;
> +   if (index == 1)
> +      res = 0x08AFu;
> +   if (index == 2)
> +      res = 0x075Au;
> +   if (index == 3)
> +      res = 0x0629u;
> +   if (index == 4)
> +      res = 0x051Au;
> +   if (index == 5)
> +      res = 0x0429u;
> +   if (index == 6)
> +      res = 0x0356u;
> +   if (index == 7)
> +      res = 0x029Eu;
> +   if (index == 8)
> +      res = 0x0200u;
> +   if (index == 9)
> +      res = 0x0179u;
> +   if (index == 10)
> +      res = 0x0109u;
> +   if (index == 11)
> +      res = 0x00AFu;
> +   if (index == 12)
> +      res = 0x0068u;
> +   if (index == 13)
> +      res = 0x0034u;
> +   if (index == 14)
> +      res = 0x0012u;
> +   if (index == 15)
> +      res = 0x0002u;
> +
> +   return res;
> +}*/
> +

I've mentioned this before, but these if-chain lookup tables
(sqrtOddAdjustments / sqrtEvenAdjustments) really look terrible to me. If
GPUs didn't operate in a SIMD manner, that might be OK...
I have some doubts this is the right approach for vectors, but you could
at least more or less trivially reduce the number of ifs by a factor of 2
by packing two table values into one uint and selecting the right one with
a final shift / mix.
But well, it should be correct, I suppose...
And in any case, it's all commented out, so no big deal.

> +/* Returns an approximation to the square root of the 32-bit significand given
> + * by `a'.  Considered as an integer, `a' must be at least 2^31.  If bit 0 of
> + * `aExp' (the least significant bit) is 1, the integer returned approximates
> + * 2^31*sqrt(`a'/2^31), where `a' is considered an integer.  If bit 0 of `aExp'
> + * is 0, the integer returned approximates 2^31*sqrt(`a'/2^30).  In either
> + * case, the approximation returned lies strictly within +/-2 of the exact
> + * value.
> + */
> +/*uint estimateSqrt32(int aExp, uint a)
> +{
> +   uint z;
> +
> +   int index = int(a>>27 & 15u);
> +   if ((aExp & 1) != 0) {
> +      z = 0x4000u + (a>>17) - sqrtOddAdjustments(index);
> +      z = ((a / z)<<14) + (z<<15);
> +      a >>= 1;
> +   } else {
> +      z = 0x8000u + (a>>17) - sqrtEvenAdjustments(index);
> +      z = a / z + z;
> +      z = (0x20000u <= z) ? 0xFFFF8000u : (z<<15);
> +      if (z <= a)
> +         return uint(int(a)>>1);
> +   }
> +   return ((estimateDiv64To32(a, 0u, z))>>1) + (z>>1);
> +}*/
> +
> +/* Returns the square root of the double-precision floating-point value `a'.
> + * The operation is performed according to the IEEE Standard for Floating-Point
> + * Arithmetic.
> + */
> +uvec2
> +fsqrt64(uvec2 a)
> +{
> +/*   uint zFrac0 = 0u;
> +   uint zFrac1 = 0u;
> +   uint zFrac2 = 0u;
> +   uint doubleZFrac0 = 0u;
> +   uint rem0 = 0u;
> +   uint rem1 = 0u;
> +   uint rem2 = 0u;
> +   uint rem3 = 0u;
> +   uint term0 = 0u;
> +   uint term1 = 0u;
> +   uint term2 = 0u;
> +   uint term3 = 0u;
> +   uvec2 default_nan;
> +   default_nan.y = 0xFFFFFFFFu;
> +   default_nan.x = 0xFFFFFFFFu;
> +
> +   uint aFracLo = extractFloat64FracLo(a);
> +   uint aFracHi = extractFloat64FracHi(a);
> +   int aExp = extractFloat64Exp(a);
> +   uint aSign = extractFloat64Sign(a);
> +   if (aExp == 0x7FF) {
> +      if ((aFracHi | aFracLo) != 0u)
> +         return propagateFloat64NaN(a, a);
> +      if (aSign == 0u)
> +         return a;
> +      return default_nan;
> +   }
> +   if (aSign != 0u) {
> +      if ((uint(aExp) | aFracHi | aFracLo) == 0u)
> +         return a;
> +      return default_nan;
> +   }
> +   if (aExp == 0) {
> +      if ((aFracHi | aFracLo) == 0u)
> +         return packFloat64(0u, 0, 0u, 0u);
> +      normalizeFloat64Subnormal(aFracHi, aFracLo, aExp, aFracHi, aFracLo);
> +   }
> +   int zExp = ((aExp - 0x3FF)>>1) + 0x3FE;
> +   aFracHi |= 0x00100000u;
> +   shortShift64Left(aFracHi, aFracLo, 11, term0, term1);
> +   zFrac0 = (estimateSqrt32(aExp, term0)>>1) + 1u;
> +   if (zFrac0 == 0u)
> +      zFrac0 = 0x7FFFFFFFu;
> +   doubleZFrac0 = zFrac0 + zFrac0;
> +   shortShift64Left(aFracHi, aFracLo, 9 - (aExp & 1), aFracHi, aFracLo);
> +   mul32To64(zFrac0, zFrac0, term0, term1);
> +   sub64(aFracHi, aFracLo, term0, term1, rem0, rem1);
> +   while (int(rem0) < 0) {
> +      --zFrac0;
> +      doubleZFrac0 -= 2u;
> +      add64(rem0, rem1, 0u, doubleZFrac0 | 1u, rem0, rem1);
> +   }
> +   zFrac1 = estimateDiv64To32(rem1, 0u, doubleZFrac0);
> +   if ((zFrac1 & 0x1FFu) <= 5u) {
> +      if (zFrac1 == 0u)
> +         zFrac1 = 1u;
> +      mul32To64(doubleZFrac0, zFrac1, term1, term2);
> +      sub64(rem1, 0u, term1, term2, rem1, rem2);
> +      mul32To64(zFrac1, zFrac1, term2, term3);
> +      sub96(rem1, rem2, 0u, 0u, term2, term3, rem1, rem2, rem3);
> +      while (int(rem1) < 0) {
> +         --zFrac1;
> +         shortShift64Left(0u, zFrac1, 1, term2, term3);
> +         term3 |= 1u;
> +         term2 |= doubleZFrac0;
> +         add96(rem1, rem2, rem3, 0u, term2, term3, rem1, rem2, rem3);
> +      }
> +      zFrac1 |= uint((rem1 | rem2 | rem3) != 0u);
> +   }
> +   shift64ExtraRightJamming(zFrac0, zFrac1, 0u, 10, zFrac0, zFrac1, zFrac2);
> +   return roundAndPackFloat64(0u, zExp, zFrac0, zFrac1, zFrac2);*/
> +
> +   return fp32_to_fp64(sqrt(fp64_to_fp32(a)));
Quite surprising you get away with this...
The biggest problem, imho, is that for sufficiently large exponents you
get an INF back, because the fp64->fp32 conversion already overflows
before the sqrt is taken - it's probably surprising that
sqrt(+finite_num) == +INF... (And likewise, for sufficiently small
exponents the conversion underflows and you get back 0, which is just as
surprising.)

In any case, GLSL certainly wouldn't require totally accurate results
(as the commented-out logic above would give), but this fp32 round-trip
definitely no longer falls within GLSL's rather lax requirements.

Roland

> +}
> diff --git a/src/compiler/glsl/glcpp/glcpp-parse.y b/src/compiler/glsl/glcpp/glcpp-parse.y
> index 3fcdcb0..d2411c5 100644
> --- a/src/compiler/glsl/glcpp/glcpp-parse.y
> +++ b/src/compiler/glsl/glcpp/glcpp-parse.y
> @@ -2381,6 +2381,7 @@ _glcpp_parser_handle_version_declaration(glcpp_parser_t *parser, intmax_t versio
>           add_builtin_define(parser, "__have_builtin_builtin_int_to_fp64", 1);
>           add_builtin_define(parser, "__have_builtin_builtin_fp64_to_fp32", 1);
>           add_builtin_define(parser, "__have_builtin_builtin_fp32_to_fp64", 1);
> +         add_builtin_define(parser, "__have_builtin_builtin_fsqrt64", 1);
>        }
>     }
>  
>