[Mesa-stable] [PATCH] mesa: Use SSE prefetch instructions rather than 3DNow instructions

Sat Feb 6 00:51:26 CET 2016

I understand the need to change the "regular" x64 paths, but the 3DNow
paths should not have been changed.  From a bit of grepping and a cursory
examination of the code (and deep, dark memories), I think the 3DNow paths
can still be used.  Either there were some specializations that only had
3DNow versions, or there were some early Athlon64 processors that couldn't
use some of the SSE paths.

If the code is dead, it should be removed... not subjected to 
search-and-replace changes that don't actually make sense.  Right?

On February 5, 2016 11:23:12 AM Patrick Baggett <baggett.patrick at gmail.com> 
wrote:

> On Fri, Feb 5, 2016 at 1:15 PM, Ian Romanick <idr at freedesktop.org> wrote:
>> On 02/03/2016 01:54 PM, Timothy Arceri wrote:
>>> From: Patrick Baggett <baggett.patrick at gmail.com>
>>>
>>> 64-bit Pentium 4 CPUs don't have the 3DNow prefetch instructions,
>>> which results in an illegal-instruction crash.
>>
>> I'll go out on a limb and say that Pentium4 CPUs never hit the functions
>> with 3dnow in the name.  I'll go even farther out on the limb and say
>> that all CPUs that do hit that path have the... uh... 3DNow prefetch
>> instructions.
>
> Ian,
>
> Just to provide some context, the SSE code paths here used PREFETCHW,
> which is a 3DNow instruction; this was the source of the crash. I
> would additionally guess (though I haven't verified) that the
> *_3dnow_* functions are never hit in x86-64 mode because usually code
> prefers SSE over 3DNow, and all x86-64 CPUs have SSE2.
>
> Patrick
>
>>
>>> Reviewed-by: Roland Scheidegger <sroland at vmware.com>
>>> Tested-by: Timothy Arceri <t_arceri at yahoo.com.au>
>>> https://bugs.freedesktop.org/show_bug.cgi?id=27512
>>> ---
>>>  src/mesa/x86-64/xform4.S | 40 ++++++++++++++++++++--------------------
>>>  1 file changed, 20 insertions(+), 20 deletions(-)
>>>
>>> diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S
>>> index c185f62..b0aca19 100644
>>> --- a/src/mesa/x86-64/xform4.S
>>> +++ b/src/mesa/x86-64/xform4.S
>>> @@ -69,7 +69,7 @@ _mesa_x86_64_transform_points4_general:
>>>       movq V4F_START(%rdx), %rdx      /* ptr to first src vertex */
>>>       movq V4F_START(%rdi), %rdi      /* ptr to first dest vertex */
>>>
>>> -     prefetch 16(%rdx)
>>> +     prefetcht1 16(%rdx)
>>>
>>>       movaps 0(%rsi), %xmm4           /* m3  | m2  | m1  | m0  */
>>>       movaps 16(%rsi), %xmm5          /* m7  | m6  | m5  | m4  */
>>> @@ -80,7 +80,7 @@ _mesa_x86_64_transform_points4_general:
>>>  p4_general_loop:
>>>
>>>       movups (%rdx), %xmm8            /* ox | oy | oz | ow */
>>> -     prefetchw 16(%rdi)
>>> +     prefetcht1 16(%rdi)
>>>
>>>       pshufd $0x00, %xmm8, %xmm0      /* ox | ox | ox | ox */
>>>       addq %rax, %rdx
>>> @@ -93,7 +93,7 @@ p4_general_loop:
>>>       addps %xmm1, %xmm0              /* ox*m3+oy*m7 | ... */
>>>       mulps %xmm7, %xmm3              /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
>>>       addps %xmm2, %xmm0              /* ox*m3+oy*m7+oz*m11 | ... */
>>> -     prefetch 16(%rdx)
>>> +     prefetcht1 16(%rdx)
>>>       addps %xmm3, %xmm0              /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
>>>
>>>       movaps %xmm0, (%rdi)            /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
>>> @@ -150,7 +150,7 @@ _mesa_x86_64_transform_points4_3d:
>>>       movq V4F_START(%rdx), %rdx      /* ptr to first src vertex */
>>>       movq V4F_START(%rdi), %rdi      /* ptr to first dest vertex */
>>>
>>> -     prefetch 16(%rdx)
>>> +     prefetcht1 16(%rdx)
>>>
>>>       movaps 0(%rsi), %xmm4           /* m3  | m2  | m1  | m0  */
>>>       movaps 16(%rsi), %xmm5          /* m7  | m6  | m5  | m4  */
>>> @@ -166,7 +166,7 @@ _mesa_x86_64_transform_points4_3d:
>>>  p4_3d_loop:
>>>
>>>       movups (%rdx), %xmm8            /* ox | oy | oz | ow */
>>> -     prefetchw 16(%rdi)
>>> +     prefetcht1 16(%rdi)
>>>
>>>       pshufd $0x00, %xmm8, %xmm0      /* ox | ox | ox | ox */
>>>       addq %rax, %rdx
>>> @@ -179,7 +179,7 @@ p4_3d_loop:
>>>       addps %xmm1, %xmm0              /* ox*m3+oy*m7 | ... */
>>>       mulps %xmm7, %xmm3              /* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
>>>       addps %xmm2, %xmm0              /* ox*m3+oy*m7+oz*m11 | ... */
>>> -     prefetch 16(%rdx)
>>> +     prefetcht1 16(%rdx)
>>>       addps %xmm3, %xmm0              /* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
>>>
>>>       movaps %xmm0, (%rdi)            /* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
>>> @@ -210,8 +210,8 @@ _mesa_x86_64_transform_points4_identity:
>>>
>>>       movq V4F_START(%rdx), %rsi      /* ptr to first src vertex */
>>>       movq V4F_START(%rdi), %rdi      /* ptr to first dest vertex */
>>> -     prefetch 64(%rsi)
>>> -     prefetchw 64(%rdi)
>>> +     prefetcht1 64(%rsi)
>>> +     prefetcht1 64(%rdi)
>>>
>>>       add %ecx, %ecx
>>>
>>> @@ -242,7 +242,7 @@ _mesa_3dnow_transform_points4_3d_no_rot:
>>>       movq V4F_START(%rdx), %rdx      /* ptr to first src vertex */
>>>       movq V4F_START(%rdi), %rdi      /* ptr to first dest vertex */
>>>
>>> -     prefetch (%rdx)
>>> +     prefetcht1 (%rdx)
>>>
>>>       movd (%rsi), %mm0               /*                 | m00             */
>>>       .byte 0x66, 0x66, 0x90          /* manual align += 3 */
>>> @@ -255,7 +255,7 @@ _mesa_3dnow_transform_points4_3d_no_rot:
>>>
>>>  p4_3d_no_rot_loop:
>>>
>>> -     prefetchw 32(%rdi)
>>> +     prefetcht1 32(%rdi)
>>>
>>>       movq  (%rdx), %mm4              /* x1              | x0              */
>>>       movq  8(%rdx), %mm5             /* x3              | x2              */
>>> @@ -279,7 +279,7 @@ p4_3d_no_rot_loop:
>>>       addq $16, %rdi
>>>
>>>       decl %ecx
>>> -     prefetch 32(%rdx)
>>> +     prefetcht1 32(%rdx)
>>>       jnz p4_3d_no_rot_loop
>>>
>>>  p4_3d_no_rot_done:
>>> @@ -311,7 +311,7 @@ _mesa_3dnow_transform_points4_perspective:
>>>       punpckldq 20(%rsi), %mm0        /* m11             | m00             */
>>>
>>>       movq 32(%rsi), %mm2             /* m21             | m20             */
>>> -     prefetch (%rdx)
>>> +     prefetcht1 (%rdx)
>>>
>>>       movd 40(%rsi), %mm1             /*                 | m22             */
>>>
>>> @@ -321,7 +321,7 @@ _mesa_3dnow_transform_points4_perspective:
>>>
>>>  p4_perspective_loop:
>>>
>>> -     prefetchw 32(%rdi)              /* prefetch 2 vertices ahead         */
>>> +     prefetcht1 32(%rdi)             /* prefetch 2 vertices ahead         */
>>>
>>>       movq (%rdx), %mm4               /* x1              | x0              */
>>>       movq 8(%rdx), %mm5              /* x3              | x2              */
>>> @@ -347,7 +347,7 @@ p4_perspective_loop:
>>>       addq $16, %rdi
>>>
>>>       decl %ecx
>>> -     prefetch 32(%rdx)               /* hopefully stride is zero          */
>>> +     prefetcht1 32(%rdx)             /* hopefully stride is zero          */
>>>       jnz p4_perspective_loop
>>>
>>>  p4_perspective_done:
>>> @@ -374,14 +374,14 @@ _mesa_3dnow_transform_points4_2d_no_rot:
>>>       movq V4F_START(%rdi), %rdi      /* ptr to first dest vertex */
>>>
>>>       movd (%rsi), %mm0               /*                 | m00             */
>>> -     prefetch (%rdx)
>>> +     prefetcht1 (%rdx)
>>>       punpckldq 20(%rsi), %mm0        /* m11             | m00             */
>>>
>>>       movq 48(%rsi), %mm1             /* m31             | m30             */
>>>
>>>  p4_2d_no_rot_loop:
>>>
>>> -     prefetchw 32(%rdi)              /* prefetch 2 vertices ahead         */
>>> +     prefetcht1 32(%rdi)             /* prefetch 2 vertices ahead         */
>>>
>>>       movq (%rdx), %mm4               /* x1              | x0              */
>>>       movq 8(%rdx), %mm5              /* x3              | x2              */
>>> @@ -394,7 +394,7 @@ p4_2d_no_rot_loop:
>>>       addq %rax, %rdx
>>>       pfmul %mm1, %mm6                /* x3*m31          | x3*m30          */
>>>
>>> -     prefetch 32(%rdx)               /* hopefully stride is zero          */
>>> +     prefetcht1 32(%rdx)             /* hopefully stride is zero          */
>>>       pfadd %mm4, %mm6                /* x1*m11+x3*m31   | x0*m00+x3*m30   */
>>>
>>>       movq %mm6, (%rdi)               /* write r0, r1                      */
>>> @@ -433,7 +433,7 @@ _mesa_3dnow_transform_points4_2d:
>>>       movd (%rsi), %mm0               /*                 | m00             */
>>>       movd 4(%rsi), %mm1              /*                 | m01             */
>>>
>>> -     prefetch (%rdx)
>>> +     prefetcht1 (%rdx)
>>>
>>>       punpckldq 16(%rsi), %mm0        /* m10             | m00             */
>>>       .byte 0x66, 0x66, 0x90          /* manual align += 4 */
>>> @@ -443,7 +443,7 @@ _mesa_3dnow_transform_points4_2d:
>>>
>>>  p4_2d_loop:
>>>
>>> -     prefetchw 32(%rdi)              /* prefetch 2 vertices ahead         */
>>> +     prefetcht1 32(%rdi)             /* prefetch 2 vertices ahead         */
>>>
>>>       movq (%rdx), %mm3               /* x1              | x0              */
>>>       movq 8(%rdx), %mm5              /* x3              | x2              */
>>> @@ -460,7 +460,7 @@ p4_2d_loop:
>>>       pfacc %mm4, %mm3                /* x0*m01+x1*m11   | x0*m00+x1*m10   */
>>>
>>>       pfmul %mm2, %mm6                /* x3*m31          | x3*m30          */
>>> -     prefetch 32(%rdx)               /* hopefully stride is zero          */
>>> +     prefetcht1 32(%rdx)             /* hopefully stride is zero          */
>>>
>>>       pfadd %mm6, %mm3                /* r1              | r0              */
>>>
>>>
>>
> _______________________________________________
> mesa-stable mailing list
> mesa-stable at lists.freedesktop.org
> http://lists.freedesktop.org/mailman/listinfo/mesa-stable