[Mesa-dev] [PATCH] llvmpipe: Remove the special path for TGSI_OPCODE_EXP.

Wed Sep 11 08:12:13 PDT 2013

Am 11.09.2013 16:28, schrieb Jose Fonseca:
> GLSL does not use it.
That's true.

> 
> vs_2_0 does not use it either  http://msdn.microsoft.com/en-us/library/windows/desktop/bb173373(v=vs.85).aspx
Well, technically it could be used for that as a cheaper alternative to
exp2 (with just a result swizzle, getting rid of the unused part),
since the accuracy required is less than with ordinary exp2.


> D3D10 doesn't have a similar thing either.
Sure.

> 
> It just didn't seem worth keeping this special path. And it seemed hard to fix
> it without breaking NaN/Inf correctness.
Well, if that's the only concern, that should be trivial to fix: just use
a separate lp_build_fract() (without clamping the input first) for the
result of dst.y - that will still generate way, way better code for it
than doing the ex2 twice, without really touching the "ordinary" exp2 path.
But if it's really mostly unused it may not be worth it; I just don't
like making things twice as slow without a really good reason when the
code is already there.

Roland


> 
> Jose
> 
> ----- Original Message -----
>> Hmm, sure it is rarely used (for arb_vp and d3d9 vs 1.1 (2.0 too maybe,
>> though the semantics are different there even if the precision required
>> is the same))?
>> The problem I have with this is that the emulation which will get used
>> instead is _extremely_ terrible. EXP should be a cheaper alternative to
>> EX2, yet the emulation will make it more than twice as expensive
>> (because there are _two_ ex2 calls in exp_emit()).
>> Also, since the exp/log functions actually have configurable precision
>> (though it is compile-time dependent for now), maybe we could exploit that
>> and use a polynomial of lower degree?
>> Otherwise though having less specialized code makes sense.
>>
>> Roland
>>
>>
>>
>> Am 11.09.2013 13:04, schrieb jfonseca at vmware.com:
>>> From: José Fonseca <jfonseca at vmware.com>
>>>
>>> It was wrong for EXP.y, as we clamped the source before computing the
>>> fractional part, and this opcode should be rarely used, so it's not
>>> worth the hassle.
>>> ---
>>>  src/gallium/auxiliary/gallivm/lp_bld_arit.c        | 80
>>>  ++++++++--------------
>>>  src/gallium/auxiliary/gallivm/lp_bld_arit.h        |  7 --
>>>  src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c | 15 ----
>>>  3 files changed, 30 insertions(+), 72 deletions(-)
>>>
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> index 09107ff..00052ed 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.c
>>> @@ -3001,12 +3001,9 @@ const double lp_build_exp2_polynomial[] = {
>>>  };
>>>  
>>>  
>>> -void
>>> -lp_build_exp2_approx(struct lp_build_context *bld,
>>> -                     LLVMValueRef x,
>>> -                     LLVMValueRef *p_exp2_int_part,
>>> -                     LLVMValueRef *p_frac_part,
>>> -                     LLVMValueRef *p_exp2)
>>> +LLVMValueRef
>>> +lp_build_exp2(struct lp_build_context *bld,
>>> +              LLVMValueRef x)
>>>  {
>>>     LLVMBuilderRef builder = bld->gallivm->builder;
>>>     const struct lp_type type = bld->type;
>>> @@ -3019,65 +3016,48 @@ lp_build_exp2_approx(struct lp_build_context *bld,
>>>  
>>>     assert(lp_check_value(bld->type, x));
>>>  
>>> -   if(p_exp2_int_part || p_frac_part || p_exp2) {
>>> -      /* TODO: optimize the constant case */
>>> -      if (gallivm_debug & GALLIVM_DEBUG_PERF &&
>>> -          LLVMIsConstant(x)) {
>>> -         debug_printf("%s: inefficient/imprecise constant arithmetic\n",
>>> -                      __FUNCTION__);
>>> -      }
>>>  
>>> -      assert(type.floating && type.width == 32);
>>> +   /* TODO: optimize the constant case */
>>> +   if (gallivm_debug & GALLIVM_DEBUG_PERF &&
>>> +       LLVMIsConstant(x)) {
>>> +      debug_printf("%s: inefficient/imprecise constant arithmetic\n",
>>> +                   __FUNCTION__);
>>> +   }
>>>  
>>> -      /* We want to preserve NaN and make sure than for exp2 if x > 128,
>>> -       * the result is INF  and if it's smaller than -126.9 the result is
>>> 0 */
>>> -      x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,
>>> 128.0), x,
>>> -                           GALLIVM_NAN_RETURN_SECOND);
>>> -      x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type,
>>> -126.99999), x,
>>> -                           GALLIVM_NAN_RETURN_SECOND);
>>> +   assert(type.floating && type.width == 32);
>>>  
>>> -      /* ipart = floor(x) */
>>> -      /* fpart = x - ipart */
>>> -      lp_build_ifloor_fract(bld, x, &ipart, &fpart);
>>> -   }
>>> +   /* We want to preserve NaN and make sure than for exp2 if x > 128,
>>> +    * the result is INF  and if it's smaller than -126.9 the result is 0
>>> */
>>> +   x = lp_build_min_ext(bld, lp_build_const_vec(bld->gallivm, type,
>>> 128.0), x,
>>> +                        GALLIVM_NAN_RETURN_SECOND);
>>> +   x = lp_build_max_ext(bld, lp_build_const_vec(bld->gallivm, type,
>>> -126.99999), x,
>>> +                        GALLIVM_NAN_RETURN_SECOND);
>>>  
>>> -   if(p_exp2_int_part || p_exp2) {
>>> -      /* expipart = (float) (1 << ipart) */
>>> -      expipart = LLVMBuildAdd(builder, ipart,
>>> -                              lp_build_const_int_vec(bld->gallivm, type,
>>> 127), "");
>>> -      expipart = LLVMBuildShl(builder, expipart,
>>> -                              lp_build_const_int_vec(bld->gallivm, type,
>>> 23), "");
>>> -      expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
>>> -   }
>>> +   /* ipart = floor(x) */
>>> +   /* fpart = x - ipart */
>>> +   lp_build_ifloor_fract(bld, x, &ipart, &fpart);
>>>  
>>> -   if(p_exp2) {
>>> -      expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
>>> -                                     Elements(lp_build_exp2_polynomial));
>>>  
>>> -      res = LLVMBuildFMul(builder, expipart, expfpart, "");
>>> -   }
>>>  
>>> -   if(p_exp2_int_part)
>>> -      *p_exp2_int_part = expipart;
>>> +   /* expipart = (float) (1 << ipart) */
>>> +   expipart = LLVMBuildAdd(builder, ipart,
>>> +                           lp_build_const_int_vec(bld->gallivm, type,
>>> 127), "");
>>> +   expipart = LLVMBuildShl(builder, expipart,
>>> +                           lp_build_const_int_vec(bld->gallivm, type, 23),
>>> "");
>>> +   expipart = LLVMBuildBitCast(builder, expipart, vec_type, "");
>>>  
>>> -   if(p_frac_part)
>>> -      *p_frac_part = fpart;
>>>  
>>> -   if(p_exp2)
>>> -      *p_exp2 = res;
>>> -}
>>> +   expfpart = lp_build_polynomial(bld, fpart, lp_build_exp2_polynomial,
>>> +                                  Elements(lp_build_exp2_polynomial));
>>> +
>>> +   res = LLVMBuildFMul(builder, expipart, expfpart, "");
>>>  
>>>  
>>> -LLVMValueRef
>>> -lp_build_exp2(struct lp_build_context *bld,
>>> -              LLVMValueRef x)
>>> -{
>>> -   LLVMValueRef res;
>>> -   lp_build_exp2_approx(bld, x, NULL, NULL, &res);
>>>     return res;
>>>  }
>>>  
>>>  
>>> +
>>>  /**
>>>   * Extract the exponent of a IEEE-754 floating point value.
>>>   *
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> index d98025e..49d4e2c 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_arit.h
>>> @@ -326,13 +326,6 @@ lp_build_ilog2(struct lp_build_context *bld,
>>>                 LLVMValueRef x);
>>>  
>>>  void
>>> -lp_build_exp2_approx(struct lp_build_context *bld,
>>> -                     LLVMValueRef x,
>>> -                     LLVMValueRef *p_exp2_int_part,
>>> -                     LLVMValueRef *p_frac_part,
>>> -                     LLVMValueRef *p_exp2);
>>> -
>>> -void
>>>  lp_build_log2_approx(struct lp_build_context *bld,
>>>                       LLVMValueRef x,
>>>                       LLVMValueRef *p_exp,
>>> diff --git a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> index 86c3249..1cfaf78 100644
>>> --- a/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> +++ b/src/gallium/auxiliary/gallivm/lp_bld_tgsi_action.c
>>> @@ -1057,20 +1057,6 @@ ex2_emit_cpu(
>>>                                                          emit_data->args[0]);
>>>  }
>>>  
>>> -/* TGSI_OPCODE_EXP (CPU Only) */
>>> -static void
>>> -exp_emit_cpu(
>>> -   const struct lp_build_tgsi_action * action,
>>> -   struct lp_build_tgsi_context * bld_base,
>>> -   struct lp_build_emit_data * emit_data)
>>> -{
>>> -   lp_build_exp2_approx(&bld_base->base, emit_data->args[0],
>>> -                        &emit_data->output[TGSI_CHAN_X],
>>> -                        &emit_data->output[TGSI_CHAN_Y],
>>> -                        &emit_data->output[TGSI_CHAN_Z]);
>>> -   emit_data->output[TGSI_CHAN_W] = bld_base->base.one;
>>> -}
>>> -
>>>  /* TGSI_OPCODE_F2I (CPU Only) */
>>>  static void
>>>  f2i_emit_cpu(
>>> @@ -1785,7 +1771,6 @@ lp_set_default_actions_cpu(
>>>     bld_base->op_actions[TGSI_OPCODE_CMP].emit = cmp_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_DIV].emit = div_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_EX2].emit = ex2_emit_cpu;
>>> -   bld_base->op_actions[TGSI_OPCODE_EXP].emit = exp_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_F2I].emit = f2i_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_F2U].emit = f2u_emit_cpu;
>>>     bld_base->op_actions[TGSI_OPCODE_FLR].emit = flr_emit_cpu;
>>>
>>