[Mesa-dev] [PATCH] mesa: Use SSE prefetch instructions rather than 3DNow instructions

Wed Feb 3 21:35:49 UTC 2016

I don't see much point in replacing the ones in the _mesa_3dnow_xx
functions, but I suppose it's better than missing some, and it shouldn't
hurt either.
(There are 3DNow-capable CPUs that support prefetch but not prefetcht1,
but there aren't any x86_64 CPUs that lack prefetcht1, so
why not.)

Reviewed-by: Roland Scheidegger <sroland at vmware.com>

Am 03.02.2016 um 22:05 schrieb Timothy Arceri:
> From: Patrick Baggett <baggett.patrick at gmail.com>
> 
> 64-bit Pentium 4 CPUs don't have the 3DNow prefetch instructions,
> so executing them results in an illegal instruction crash.
> 
> Cc: Roland Scheidegger <sroland at vmware.com>
> Tested-by: Timothy Arceri <t_arceri at yahoo.com.au>
> https://bugs.freedesktop.org/show_bug.cgi?id=27512
> ---
>  src/mesa/x86-64/xform4.S | 40 ++++++++++++++++++++--------------------
>  1 file changed, 20 insertions(+), 20 deletions(-)
> 
> diff --git a/src/mesa/x86-64/xform4.S b/src/mesa/x86-64/xform4.S
> index c185f62..b0aca19 100644
> --- a/src/mesa/x86-64/xform4.S
> +++ b/src/mesa/x86-64/xform4.S
> @@ -69,7 +69,7 @@ _mesa_x86_64_transform_points4_general:
>  	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
>  	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
>  
> -	prefetch 16(%rdx)
> +	prefetcht1 16(%rdx)
>  
>  	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
>  	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
> @@ -80,7 +80,7 @@ _mesa_x86_64_transform_points4_general:
>  p4_general_loop:
>  
>  	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
> -	prefetchw 16(%rdi)
> +	prefetcht1 16(%rdi)
>  
>  	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
>  	addq %rax, %rdx
> @@ -93,7 +93,7 @@ p4_general_loop:
>  	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
>  	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
>  	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
> -	prefetch 16(%rdx)
> +	prefetcht1 16(%rdx)
>  	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
>  
>  	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
> @@ -150,7 +150,7 @@ _mesa_x86_64_transform_points4_3d:
>  	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
>  	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
>  
> -	prefetch 16(%rdx)
> +	prefetcht1 16(%rdx)
>  
>  	movaps 0(%rsi), %xmm4		/* m3  | m2  | m1  | m0  */
>  	movaps 16(%rsi), %xmm5		/* m7  | m6  | m5  | m4  */
> @@ -166,7 +166,7 @@ _mesa_x86_64_transform_points4_3d:
>  p4_3d_loop:
>  
>  	movups (%rdx), %xmm8		/* ox | oy | oz | ow */
> -	prefetchw 16(%rdi)
> +	prefetcht1 16(%rdi)
>  
>  	pshufd $0x00, %xmm8, %xmm0	/* ox | ox | ox | ox */
>  	addq %rax, %rdx
> @@ -179,7 +179,7 @@ p4_3d_loop:
>  	addps %xmm1, %xmm0		/* ox*m3+oy*m7 | ... */
>  	mulps %xmm7, %xmm3		/* ow*m15 | ow*m14 | ow*m13 | ow*m12 */
>  	addps %xmm2, %xmm0		/* ox*m3+oy*m7+oz*m11 | ... */
> -	prefetch 16(%rdx)
> +	prefetcht1 16(%rdx)
>  	addps %xmm3, %xmm0		/* ox*m3+oy*m7+oz*m11+ow*m15 | ... */
>  
>  	movaps %xmm0, (%rdi)		/* ->D(3) | ->D(2) | ->D(1) | ->D(0) */
> @@ -210,8 +210,8 @@ _mesa_x86_64_transform_points4_identity:
>  
>  	movq V4F_START(%rdx), %rsi	/* ptr to first src vertex */
>  	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
> -	prefetch 64(%rsi)
> -	prefetchw 64(%rdi)
> +	prefetcht1 64(%rsi)
> +	prefetcht1 64(%rdi)
>  
>  	add %ecx, %ecx
>  
> @@ -242,7 +242,7 @@ _mesa_3dnow_transform_points4_3d_no_rot:
>  	movq V4F_START(%rdx), %rdx	/* ptr to first src vertex */
>  	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
>  
> -	prefetch (%rdx)
> +	prefetcht1 (%rdx)
>  	
>  	movd (%rsi), %mm0		/*                 | m00             */
>  	.byte 0x66, 0x66, 0x90	        /* manual align += 3 */
> @@ -255,7 +255,7 @@ _mesa_3dnow_transform_points4_3d_no_rot:
>  
>  p4_3d_no_rot_loop:
>  
> -	prefetchw 32(%rdi)
> +	prefetcht1 32(%rdi)
>  	
>  	movq  (%rdx), %mm4		/* x1              | x0              */
>  	movq  8(%rdx), %mm5		/* x3              | x2              */
> @@ -279,7 +279,7 @@ p4_3d_no_rot_loop:
>  	addq $16, %rdi
>  	
>  	decl %ecx
> -	prefetch 32(%rdx)
> +	prefetcht1 32(%rdx)
>  	jnz p4_3d_no_rot_loop
>  
>  p4_3d_no_rot_done:
> @@ -311,7 +311,7 @@ _mesa_3dnow_transform_points4_perspective:
>  	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
>  	
>  	movq 32(%rsi), %mm2		/* m21             | m20             */
> -	prefetch (%rdx)
> +	prefetcht1 (%rdx)
>  	
>  	movd 40(%rsi), %mm1		/*                 | m22             */
>  
> @@ -321,7 +321,7 @@ _mesa_3dnow_transform_points4_perspective:
>  
>  p4_perspective_loop:
>  
> -	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
> +	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */
>  
>  	movq (%rdx), %mm4		/* x1              | x0              */
>  	movq 8(%rdx), %mm5		/* x3              | x2              */
> @@ -347,7 +347,7 @@ p4_perspective_loop:
>  	addq $16, %rdi
>  
>  	decl %ecx
> -	prefetch 32(%rdx)		/* hopefully stride is zero          */
> +	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
>  	jnz p4_perspective_loop
>  
>  p4_perspective_done:
> @@ -374,14 +374,14 @@ _mesa_3dnow_transform_points4_2d_no_rot:
>  	movq V4F_START(%rdi), %rdi	/* ptr to first dest vertex */
>  
>  	movd (%rsi), %mm0		/*                 | m00             */
> -	prefetch (%rdx)
> +	prefetcht1 (%rdx)
>  	punpckldq 20(%rsi), %mm0	/* m11             | m00             */
>  	
>  	movq 48(%rsi), %mm1		/* m31             | m30             */
>  
>  p4_2d_no_rot_loop:
>  
> -	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
> +	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */
>  
>  	movq (%rdx), %mm4		/* x1              | x0              */
>  	movq 8(%rdx), %mm5		/* x3              | x2              */
> @@ -394,7 +394,7 @@ p4_2d_no_rot_loop:
>  	addq %rax, %rdx	
>  	pfmul %mm1, %mm6		/* x3*m31          | x3*m30          */
>  
> -	prefetch 32(%rdx)		/* hopefully stride is zero          */
> +	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
>  	pfadd %mm4, %mm6		/* x1*m11+x3*m31   | x0*m00+x3*m30   */
>  
>  	movq %mm6, (%rdi)		/* write r0, r1                      */
> @@ -433,7 +433,7 @@ _mesa_3dnow_transform_points4_2d:
>  	movd (%rsi), %mm0		/*                 | m00             */
>  	movd 4(%rsi), %mm1		/*                 | m01             */
>  
> -	prefetch (%rdx)
> +	prefetcht1 (%rdx)
>  
>  	punpckldq 16(%rsi), %mm0	/* m10             | m00             */
>  	.byte 0x66, 0x66, 0x90		/* manual align += 4 */
> @@ -443,7 +443,7 @@ _mesa_3dnow_transform_points4_2d:
>  
>  p4_2d_loop:
>  
> -	prefetchw 32(%rdi)		/* prefetch 2 vertices ahead         */
> +	prefetcht1 32(%rdi)		/* prefetch 2 vertices ahead         */
>  
>  	movq (%rdx), %mm3		/* x1              | x0              */
>  	movq 8(%rdx), %mm5		/* x3              | x2              */
> @@ -460,7 +460,7 @@ p4_2d_loop:
>  	pfacc %mm4, %mm3		/* x0*m01+x1*m11   | x0*m00+x1*m10   */
>  
>  	pfmul %mm2, %mm6		/* x3*m31          | x3*m30          */
> -	prefetch 32(%rdx)		/* hopefully stride is zero          */
> +	prefetcht1 32(%rdx)		/* hopefully stride is zero          */
>  
>  	pfadd %mm6, %mm3		/* r1              | r0              */
>  
>