[Mesa-dev] [PATCH][RFC] Clamp rgba floats with sse
Juha-Pekka Heikkila
juhapekka.heikkila at gmail.com
Fri Oct 31 03:13:09 PDT 2014
I was motivated to send this to the list after seeing Timothy Arceri's
"add SSE optimization for glDrawElements" patch. This patch is not very
nice because of its ifdefs, which are needed to let gcc optimize the loops
when possible while still avoiding SSE code where at least SSE2 is
not present.
The effect is that the clamping inner loop in _mesa_apply_rgba_transfer_ops
changes from this (I compiled with -march=native, hence the AVX opcodes
here):
0x00007ffff3dbe430 <+128>: vmovss (%rcx),%xmm2
0x00007ffff3dbe434 <+132>: vmovaps %xmm1,%xmm3
0x00007ffff3dbe438 <+136>: vucomiss %xmm2,%xmm0
0x00007ffff3dbe43c <+140>: ja 0x7ffff3dbe442 <_mesa_apply_rgba_transfer_ops+146>
0x00007ffff3dbe43e <+142>: vminss %xmm2,%xmm4,%xmm3
0x00007ffff3dbe442 <+146>: vmovss 0x4(%rcx),%xmm2
0x00007ffff3dbe447 <+151>: vmovss %xmm3,(%rcx)
0x00007ffff3dbe44b <+155>: vmovaps %xmm1,%xmm3
0x00007ffff3dbe44f <+159>: vucomiss %xmm2,%xmm0
0x00007ffff3dbe453 <+163>: ja 0x7ffff3dbe459 <_mesa_apply_rgba_transfer_ops+169>
0x00007ffff3dbe455 <+165>: vminss %xmm2,%xmm7,%xmm3
0x00007ffff3dbe459 <+169>: vmovss 0x8(%rcx),%xmm2
0x00007ffff3dbe45e <+174>: vmovss %xmm3,0x4(%rcx)
0x00007ffff3dbe463 <+179>: vmovaps %xmm1,%xmm3
0x00007ffff3dbe467 <+183>: vucomiss %xmm2,%xmm0
0x00007ffff3dbe46b <+187>: ja 0x7ffff3dbe471 <_mesa_apply_rgba_transfer_ops+193>
0x00007ffff3dbe46d <+189>: vminss %xmm2,%xmm6,%xmm3
0x00007ffff3dbe471 <+193>: vmovss 0xc(%rcx),%xmm2
0x00007ffff3dbe476 <+198>: vmovss %xmm3,0x8(%rcx)
0x00007ffff3dbe47b <+203>: vmovaps %xmm1,%xmm3
0x00007ffff3dbe47f <+207>: vucomiss %xmm2,%xmm0
0x00007ffff3dbe483 <+211>: ja 0x7ffff3dbe489 <_mesa_apply_rgba_transfer_ops+217>
0x00007ffff3dbe485 <+213>: vminss %xmm2,%xmm5,%xmm3
0x00007ffff3dbe489 <+217>: vmovss %xmm3,0xc(%rcx)
0x00007ffff3dbe48e <+222>: add $0x10,%rcx
0x00007ffff3dbe492 <+226>: cmp %rax,%rcx
0x00007ffff3dbe495 <+229>: jne 0x7ffff3dbe430 <_mesa_apply_rgba_transfer_ops+128>
into this:
0x00007ffff3dbe4d0 <+288>: vmovups (%rcx),%xmm0
0x00007ffff3dbe4d4 <+292>: add $0x10,%rcx
0x00007ffff3dbe4d8 <+296>: vmaxps %xmm1,%xmm0,%xmm0
0x00007ffff3dbe4dc <+300>: vminps %xmm2,%xmm0,%xmm0
0x00007ffff3dbe4e0 <+304>: vmovups %xmm0,-0x10(%rcx)
0x00007ffff3dbe4e5 <+309>: cmp %rax,%rcx
0x00007ffff3dbe4e8 <+312>: jne 0x7ffff3dbe4d0 <_mesa_apply_rgba_transfer_ops+288>
The two pieces of code do the same thing. One place where this really helps
is, for example, the Android home screen.
/Juha-Pekka
Juha-Pekka Heikkila (1):
mesa/main: Clamp rgba with streamed sse
src/mesa/main/colormac.h | 20 +++++++++++++++
src/mesa/main/pixeltransfer.c | 59 ++++++++++++++++++++++++++++++++-----------
2 files changed, 64 insertions(+), 15 deletions(-)
--
1.8.5.1
More information about the mesa-dev
mailing list