[Mesa-dev] [PATCH][RFC] Clamp rgba floats with sse

Juha-Pekka Heikkila juhapekka.heikkila at gmail.com
Fri Oct 31 03:13:09 PDT 2014

I was motivated to send this to the list after seeing Timothy Arceri's
"add SSE optimization for glDrawElements" patch. This patch is not very
nice because of its ifdefs, which are needed to get gcc to optimize the
loops when possible while still avoiding SSE code where at least SSE2 is
not present.

What this does is change the clamping inner loop in
_mesa_apply_rgba_transfer_ops from this (I had -march=native during
compilation, hence the AVX opcodes):

   0x00007ffff3dbe430 <+128>:	vmovss (%rcx),%xmm2
   0x00007ffff3dbe434 <+132>:	vmovaps %xmm1,%xmm3
   0x00007ffff3dbe438 <+136>:	vucomiss %xmm2,%xmm0
   0x00007ffff3dbe43c <+140>:	ja     0x7ffff3dbe442 <_mesa_apply_rgba_transfer_ops+146>
   0x00007ffff3dbe43e <+142>:	vminss %xmm2,%xmm4,%xmm3
   0x00007ffff3dbe442 <+146>:	vmovss 0x4(%rcx),%xmm2
   0x00007ffff3dbe447 <+151>:	vmovss %xmm3,(%rcx)
   0x00007ffff3dbe44b <+155>:	vmovaps %xmm1,%xmm3
   0x00007ffff3dbe44f <+159>:	vucomiss %xmm2,%xmm0
   0x00007ffff3dbe453 <+163>:	ja     0x7ffff3dbe459 <_mesa_apply_rgba_transfer_ops+169>
   0x00007ffff3dbe455 <+165>:	vminss %xmm2,%xmm7,%xmm3
   0x00007ffff3dbe459 <+169>:	vmovss 0x8(%rcx),%xmm2
   0x00007ffff3dbe45e <+174>:	vmovss %xmm3,0x4(%rcx)
   0x00007ffff3dbe463 <+179>:	vmovaps %xmm1,%xmm3
   0x00007ffff3dbe467 <+183>:	vucomiss %xmm2,%xmm0
   0x00007ffff3dbe46b <+187>:	ja     0x7ffff3dbe471 <_mesa_apply_rgba_transfer_ops+193>
   0x00007ffff3dbe46d <+189>:	vminss %xmm2,%xmm6,%xmm3
   0x00007ffff3dbe471 <+193>:	vmovss 0xc(%rcx),%xmm2
   0x00007ffff3dbe476 <+198>:	vmovss %xmm3,0x8(%rcx)
   0x00007ffff3dbe47b <+203>:	vmovaps %xmm1,%xmm3
   0x00007ffff3dbe47f <+207>:	vucomiss %xmm2,%xmm0
   0x00007ffff3dbe483 <+211>:	ja     0x7ffff3dbe489 <_mesa_apply_rgba_transfer_ops+217>
   0x00007ffff3dbe485 <+213>:	vminss %xmm2,%xmm5,%xmm3
   0x00007ffff3dbe489 <+217>:	vmovss %xmm3,0xc(%rcx)
   0x00007ffff3dbe48e <+222>:	add    $0x10,%rcx
   0x00007ffff3dbe492 <+226>:	cmp    %rax,%rcx
   0x00007ffff3dbe495 <+229>:	jne    0x7ffff3dbe430 <_mesa_apply_rgba_transfer_ops+128>

into this:

   0x00007ffff3dbe4d0 <+288>:	vmovups (%rcx),%xmm0
   0x00007ffff3dbe4d4 <+292>:	add    $0x10,%rcx
   0x00007ffff3dbe4d8 <+296>:	vmaxps %xmm1,%xmm0,%xmm0
   0x00007ffff3dbe4dc <+300>:	vminps %xmm2,%xmm0,%xmm0
   0x00007ffff3dbe4e0 <+304>:	vmovups %xmm0,-0x10(%rcx)
   0x00007ffff3dbe4e5 <+309>:	cmp    %rax,%rcx
   0x00007ffff3dbe4e8 <+312>:	jne    0x7ffff3dbe4d0 <_mesa_apply_rgba_transfer_ops+288>

The two pieces of code do the same thing. One place where this really helps
is, for example, the Android home screen.


Juha-Pekka Heikkila (1):
  mesa/main: Clamp rgba with streamed sse

 src/mesa/main/colormac.h      | 20 +++++++++++++++
 src/mesa/main/pixeltransfer.c | 59 ++++++++++++++++++++++++++++++++-----------
 2 files changed, 64 insertions(+), 15 deletions(-)


More information about the mesa-dev mailing list