[Mesa-dev] [PATCH 5/7] translate_sse: Preserve low bit during unsigned -> float conversion.

Mon Apr 14 08:30:41 PDT 2014

I am not too familiar with translate sse code, so someone else should
probably look at it. The rest of the series looks good to me, a comment
inline.

Am 13.04.2014 22:29, schrieb Andreas Hartmetz:
> From: Andreas Hartmetz <andreas.hartmetz at kdab.com>
> 
> ---
>  src/gallium/auxiliary/translate/translate_sse.c | 35 ++++++++++++++++++++-----
>  1 file changed, 28 insertions(+), 7 deletions(-)
> 
> diff --git a/src/gallium/auxiliary/translate/translate_sse.c b/src/gallium/auxiliary/translate/translate_sse.c
> index dace722..03d8276 100644
> --- a/src/gallium/auxiliary/translate/translate_sse.c
> +++ b/src/gallium/auxiliary/translate/translate_sse.c
> @@ -71,7 +71,7 @@ struct translate_group
>  
>  #define ELEMENT_BUFFER_INSTANCE_ID  1001
>  
> -#define NUM_CONSTS 7
> +#define NUM_CONSTS 8
>  
>  enum
>  {
> @@ -81,6 +81,7 @@ enum
>     CONST_INV_32767,
>     CONST_INV_65535,
>     CONST_INV_2147483647,
> +   CONST_INV_4294967295,
>     CONST_255
>  };
>  
> @@ -92,6 +93,7 @@ static float consts[NUM_CONSTS][4] = {
>     C(1.0 / 32767.0),
>     C(1.0 / 65535.0),
>     C(1.0 / 2147483647.0),
> +   C(1.0 / 4294967295.0),
>     C(255.0)
>  };
>  
> @@ -515,6 +517,7 @@ translate_attr_convert(struct translate_sse *p,
>          || a->output_format == PIPE_FORMAT_R32G32B32_FLOAT
>          || a->output_format == PIPE_FORMAT_R32G32B32A32_FLOAT)) {
>        struct x86_reg dataXMM = x86_make_reg(file_XMM, 0);
> +      struct x86_reg dataXMM2 = x86_make_reg(file_XMM, 1);
>  
>        for (i = 0; i < output_desc->nr_channels; ++i) {
>           if (swizzle[i] == UTIL_FORMAT_SWIZZLE_0
> @@ -546,17 +549,38 @@ translate_attr_convert(struct translate_sse *p,
>                  */
>                 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
>                 sse2_punpcklbw(p->func, dataXMM, get_const(p, CONST_IDENTITY));
> +               sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
>                 break;
>              case 16:
>                 sse2_punpcklwd(p->func, dataXMM, get_const(p, CONST_IDENTITY));
> +               sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
>                 break;
> -            case 32:           /* we lose precision here */
> +            case 32:           /* we lose precision if value > 2^23 - 1 */
> +               /* ...but try to keep all bits for value <= 2^23 - 1 */
> +               /* this could be done much smarter for 1 or 2 dwords of input
> +                * on X64, using sth like "cvtsi2ss xmm0, rax" (note *R*ax) */
> +
> +               /* save the low bit */
> +               sse_movss(p->func, dataXMM2, dataXMM);
I don't think that's right, this code can handle multiple channels, i.e.
true vectors hence you need sse2_movdqa (or sse_movaps but the former
should cause less domain transitions).

> +               /* right shift & convert, losing the low bit - must clear
> +                * high bit because there is no unsigned convert instruction */
>                 sse2_psrld_imm(p->func, dataXMM, 1);
> +               sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
> +
> +               /* convert low bit to float */
> +               sse2_pslld_imm(p->func, dataXMM2, 31);
> +               sse2_psrld_imm(p->func, dataXMM2, 31);
> +               sse2_cvtdq2ps(p->func, dataXMM2, dataXMM2);
Is this really ideal, wouldn't something like (in horrible pseudo-code
notation)
dataXMM2 = dataXMM2 & CONST(0x1 vec)
dataXMM2 = cvtdq2ps(dataXMM2)
be faster?
I guess though your method avoids the constant, so probably not worth
bothering (I am actually wondering what code llvm generates for its
UIToFP instruction or what in general the fastest way to do this is).

Roland

> +
> +               /* mostly undo the right-shift by 1 */
> +               sse_addps(p->func, dataXMM, dataXMM);
> +               /* add back the lost low bit */
> +               sse_addps(p->func, dataXMM, dataXMM2);
>                 break;
>              default:
>                 return FALSE;
>              }
> -            sse2_cvtdq2ps(p->func, dataXMM, dataXMM);
> +
>              if (input_desc->channel[0].normalized) {
>                 struct x86_reg factor;
>                 switch (input_desc->channel[0].size) {
> @@ -567,7 +591,7 @@ translate_attr_convert(struct translate_sse *p,
>                    factor = get_const(p, CONST_INV_65535);
>                    break;
>                 case 32:
> -                  factor = get_const(p, CONST_INV_2147483647);
> +                  factor = get_const(p, CONST_INV_4294967295);
>                    break;
>                 default:
>                    assert(0);
> @@ -579,9 +603,6 @@ translate_attr_convert(struct translate_sse *p,
>                 }
>                 sse_mulps(p->func, dataXMM, factor);
>              }
> -            else if (input_desc->channel[0].size == 32)
> -               /* compensate for the bit we threw away to fit u32 into s32 */
> -               sse_addps(p->func, dataXMM, dataXMM);
>              break;
>           case UTIL_FORMAT_TYPE_SIGNED:
>              if (!(x86_target_caps(p->func) & X86_SSE2))
>