[Mesa-dev] [PATCH] gallivm: Fix lp_build_float_to_half.

Thu Nov 29 08:47:25 PST 2012

Am 29.11.2012 16:50, schrieb Jose Fonseca:
> 
> 
> ----- Original Message -----
>> On 11/29/2012 08:32 AM, jfonseca at vmware.com wrote:
>>> From: José Fonseca<jfonseca at vmware.com>
>>>
>>> The current implementation was close by not fully correct: several
>>> operations that should be done in floating point were being done in
>>> integer.
>>>
>>> Fixes piglit fbo-clear-formats GL_ARB_texture_float
>>> ---
>>>   src/gallium/auxiliary/gallivm/lp_bld_conv.c |  103
>>>   +++++++++++++++++++--------
>>>   1 file changed, 73 insertions(+), 30 deletions(-)
>>>
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>>> b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>>> index cd18b0c..a48b053 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_conv.c
>>> @@ -63,15 +63,18 @@
>>>
>>>   #include "util/u_debug.h"
>>>   #include "util/u_math.h"
>>> +#include "util/u_half.h"
>>>   #include "util/u_cpu_detect.h"
>>>
>>>   #include "lp_bld_type.h"
>>>   #include "lp_bld_const.h"
>>>   #include "lp_bld_arit.h"
>>> +#include "lp_bld_bitarit.h"
>>>   #include "lp_bld_pack.h"
>>>   #include "lp_bld_conv.h"
>>>   #include "lp_bld_logic.h"
>>>   #include "lp_bld_intr.h"
>>> +#include "lp_bld_printf.h"
>>>
>>>
>>>
>>> @@ -220,64 +223,104 @@ lp_build_half_to_float(struct gallivm_state
>>> *gallivm,
>>>    *
>>>    * ref
>>>    http://fgiesen.wordpress.com/2012/03/28/half-to-float-done-quic/
>>>    * ref https://gist.github.com/2156668
>>> + *
>>> + * XXX: This is approximation. It is is faster certain NaNs are
>>> converted to
>>> + * infinity, and rounding is not correct.
>>
>> I can't quite parse that comment.
> 
> Oops. Will correct.
> 
>>>    */
>>>   LLVMValueRef
>>>   lp_build_float_to_half(struct gallivm_state *gallivm,
>>>                          LLVMValueRef src)
>>>   {
>>> -   struct lp_type i32_type = lp_type_int_vec(32, 32 *
>>> LLVMGetVectorSize(LLVMTypeOf(src)));
>>> -
>>>      LLVMBuilderRef builder = gallivm->builder;
>>> -   LLVMTypeRef int_vec_type = lp_build_vec_type(gallivm,
>>> i32_type);
>>> -
>>> -   struct lp_build_context bld;
>>> -
>>> +   LLVMTypeRef f32_vec_type = LLVMTypeOf(src);
>>> +   unsigned length = LLVMGetTypeKind(f32_vec_type) ==
>>> LLVMVectorTypeKind
>>> +                   ? LLVMGetVectorSize(f32_vec_type) : 1;
>>> +   struct lp_type f32_type = lp_type_float_vec(32, 32 * length);
>>> +   struct lp_type u32_type = lp_type_uint_vec(32, 32 * length);
>>> +   struct lp_type i16_type = lp_type_int_vec(16, 16 * length);
>>> +   LLVMTypeRef u32_vec_type = lp_build_vec_type(gallivm,
>>> u32_type);
>>> +   LLVMTypeRef i16_vec_type = lp_build_vec_type(gallivm,
>>> i16_type);
>>> +   struct lp_build_context f32_bld;
>>> +   struct lp_build_context u32_bld;
>>>      LLVMValueRef result;
>>>
>>> -   lp_build_context_init(&bld, gallivm, i32_type);
>>> +   lp_build_context_init(&f32_bld, gallivm, f32_type);
>>> +   lp_build_context_init(&u32_bld, gallivm, u32_type);
>>>
>>> -   /* Extra scope because lp_build_min needs a build context, le
>>> sigh */
>>>      {
>>>         /* Constants */
>>> -      LLVMValueRef i32_13        = lp_build_const_int_vec(gallivm,
>>> i32_type, 13);
>>> -      LLVMValueRef i32_16        = lp_build_const_int_vec(gallivm,
>>> i32_type, 16);
>>> -      LLVMValueRef i32_mask_fabs = lp_build_const_int_vec(gallivm,
>>> i32_type, 0x7fffffff);
>>> -      LLVMValueRef i32_f32infty  = lp_build_const_int_vec(gallivm,
>>> i32_type, 0xff<<  23);
>>> -      LLVMValueRef i32_expinf    = lp_build_const_int_vec(gallivm,
>>> i32_type, 0xe0<<  23);
>>> -      LLVMValueRef i32_f16max    = lp_build_const_int_vec(gallivm,
>>> i32_type, 0x8f<<  23);
>>> -      LLVMValueRef i32_magic     = lp_build_const_int_vec(gallivm,
>>> i32_type, 0x0f<<  23);
>>> +      LLVMValueRef u32_f32inf    = lp_build_const_int_vec(gallivm,
>>> u32_type, 0xff<<  23);
>>> +      LLVMValueRef u32_expinf    = lp_build_const_int_vec(gallivm,
>>> u32_type, 0xe0<<  23);
>>> +      LLVMValueRef f32_f16max    = lp_build_const_vec(gallivm,
>>> f32_type, 65536.0); // 0x8f<<  23
>>> +      LLVMValueRef f32_magic     = lp_build_const_vec(gallivm,
>>> f32_type, 1.92592994e-34); // 0x0f<<  23
>>>
>>>         /* Cast from float32 to int32 */
>>> -      LLVMValueRef f             = LLVMBuildBitCast(builder, src,
>>> int_vec_type, "");
>>> +      LLVMValueRef f             = LLVMBuildBitCast(builder, src,
>>> u32_vec_type, "");
>>>
>>>         /* Remove sign */
>>> -      LLVMValueRef fabs          = LLVMBuildAnd(builder,
>>> i32_mask_fabs, f, "");
>>> +      LLVMValueRef srcabs         = lp_build_abs(&f32_bld, src);
>>> +      LLVMValueRef fabs           = LLVMBuildBitCast(builder,
>>> srcabs, u32_vec_type, "");
>>>
>>>         /* Magic conversion */
>>> -      LLVMValueRef clamped       = lp_build_min(&bld, i32_f16max,
>>> fabs);
>>> -      LLVMValueRef scaled        = LLVMBuildMul(builder, clamped,
>>> i32_magic, "");
>>> -
>>> +      LLVMValueRef clamped       = lp_build_min(&f32_bld,
>>> f32_f16max, srcabs);
>>> +      LLVMValueRef scaled        = LLVMBuildBitCast(builder,
>>> +
>>>                                                    LLVMBuildFMul(builder,
>>> +
>>>                                                                  clamped,
>>> +
>>>                                                                  f32_magic,
>>> +
>>>                                                                  ""),
>>> +                                                    u32_vec_type,
>>> +                                                    "");
>>>         /* Make sure Inf/NaN and unormalised survive */
>>> -      LLVMValueRef infnancase    = LLVMBuildXor(builder,
>>> i32_expinf, fabs, "");
>>> -      LLVMValueRef b_notnormal   = lp_build_compare(gallivm,
>>> i32_type, PIPE_FUNC_GREATER, fabs, i32_f32infty);
>>> +      LLVMValueRef infnancase    = LLVMBuildXor(builder,
>>> u32_expinf, fabs, "");
>>> +      LLVMValueRef b_notnormal   = lp_build_compare(gallivm,
>>> f32_type, PIPE_FUNC_GEQUAL,
>>> +                                                    srcabs,
>>> +
>>>                                                    LLVMBuildBitCast(builder,
>>> u32_f32inf, f32_vec_type, ""));
>>>
>>>         /* Merge normal / unnormal case */
>>> -      LLVMValueRef merge1        = LLVMBuildAnd(builder,
>>> infnancase, b_notnormal, "");
>>> -      LLVMValueRef merge2        = LLVMBuildNot(builder,
>>> LLVMBuildAnd(builder, b_notnormal, scaled, ""), "");
>>> -      LLVMValueRef merged        = LLVMBuildOr(builder, merge1,
>>> merge2, "");
>>> -      LLVMValueRef shifted       = LLVMBuildLShr(builder, merged,
>>> i32_13, "");
>>> +      LLVMValueRef merged        = lp_build_select(&u32_bld,
>>> b_notnormal, infnancase, scaled);
>>> +      LLVMValueRef shifted       = lp_build_shr_imm(&u32_bld,
>>> merged, 13);
>>>
>>>         /* Sign bit */
>>>         LLVMValueRef justsign      = LLVMBuildXor(builder, f, fabs,
>>>         "");
>>> -      LLVMValueRef signshifted   = LLVMBuildLShr(builder,
>>> justsign, i32_16, "");
>>> +      LLVMValueRef signshifted   = lp_build_shr_imm(&u32_bld,
>>> justsign, 16);
>>>
>>>         /* Combine result */
>>>         result                     = LLVMBuildOr(builder, shifted,
>>>         signshifted, "");
>>>      }
>>>
>>> -   /* Truncate from 32 bit to 16 bit */
>>> -   i32_type.width = 16;
>>> -   return LLVMBuildTrunc(builder, result,
>>> lp_build_vec_type(gallivm, i32_type), "");
>>> +   result = LLVMBuildTrunc(builder, result, i16_vec_type, "");
>>> +
>>
>> The following if (0) block is just debug code, right?  
> 
> Yes.
> 
>> It's a little
>> suspect because it manipulates the 'result' return value.  Maybe add
>> a
>> comment about what's going on.
> 
> I'm sure we'll need to revisit this code, due to the liberties it takes converting, which is why I left the debug code here. I'll add a comment to this effect.
> 
>>
>>> +   if (0) {
>>> +     LLVMTypeRef i32t = LLVMInt32TypeInContext(gallivm->context);
>>> +     LLVMTypeRef i16t = LLVMInt16TypeInContext(gallivm->context);
>>> +     LLVMTypeRef f32t = LLVMFloatTypeInContext(gallivm->context);
>>> +     unsigned i;
>>> +
>>> +     LLVMTypeRef func_type = LLVMFunctionType(i16t,&f32t, 1, 0);
>>> +     LLVMValueRef func = lp_build_const_int_pointer(gallivm,
>>> func_to_pointer((func_pointer)util_float_to_half));
>>> +     func = LLVMBuildBitCast(builder, func,
>>> LLVMPointerType(func_type, 0), "util_float_to_half");
>>> +
>>> +     lp_build_print_value(gallivm, "src  = ", src);
>>> +     lp_build_print_value(gallivm, "util = ", result);
>>> +     lp_build_print_value(gallivm, "llvm = ", result);
>>> +
>>> +     result = LLVMGetUndef(LLVMVectorType(i16t, length));
>>> +     for (i = 0; i<  length; ++i) {
>>> +        LLVMValueRef index = LLVMConstInt(i32t, i, 0);
>>> +        LLVMValueRef f32 = LLVMBuildExtractElement(builder, src,
>>> index, "");
>>> +#if 0
>>> +        /* XXX: not really supported by backends */
>>> +        LLVMValueRef f16 = lp_build_intrinsic_unary(builder,
>>> "llvm.convert.to.fp16", i16t, f32);
>>> +#else
>>> +        LLVMValueRef f16 = LLVMBuildCall(builder, func,&f32, 1,
>>> "");
>>> +#endif
>>> +        result = LLVMBuildInsertElement(builder, result, f16,
>>> index, "");
>>> +     }
>>> +
>>> +     lp_build_printf(gallivm, "\n");
>>> +  }
>>> +
>>> +   return result;
>>>   }
>>>
>>>
>>
>> I didn't study the code in detail (it's pretty dense), but it looks
>> OK
>> on the surface.  Roland?
>>
>> Reviewed-by: Brian Paul <brianp at vmware.com>
> 

Looks good to me as well.
There are llvm intrinisics for converting from/to half-floats
(llvm.convert.from.fp16, llvm.convert.to.fp16) which might be worth a
try - though from the specs it looks like they don't support vectors
which would be very unfortunate (and I don't know if they'd work on
backends which don't have hw support for it). Otherwise, we should
probably also add support for f16c/cvt16 CVTPS2PH/CVTPH2PS instructions
manually on supported cpus (Ivy Bridge, Bulldozer) which do exactly what
we want without any magic (and of course an order of magnitude faster).

Roland