[Pixman] [PATCH 1/1 v2] vmx: workarounds to fix powerpc little endian particularities

Fernando Seiti Furusato ferseiti at linux.vnet.ibm.com
Mon Jun 1 13:46:00 PDT 2015


I have made some changes to pixman-vmx.c, which uses VMX (aka AltiVec)
to optimize pixman. In summary:
- Changed splat_alpha so that the vec_perm indices are XORed with an
  endian-dependent value, making the permutation work regardless of
  endianness (a small scalar sketch of the idea follows below).
- Replaced vec_mergeh, vec_mergel and vec_mladd with vec_mule and
  vec_mulo plus vec_add and vec_perm. The result is the same and is
  not affected by endianness differences.
- Replaced vec_lvsl with direct unaligned assignment (=), because,
  according to the Power ABI specification, use of lvsl is deprecated
  on ppc64le.
- Made the COMPUTE_SHIFT_{MASK,MASKS,MASKC} macros no-ops on
  little-endian PowerPC, since unaligned access is supported on
  ppc64le.
After those changes, all tests passed on ppc64, ppc64le and powerpc VMs.
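
To illustrate the index-XOR idea from the first item, here is a small
scalar sketch (explanatory only, not part of the patch; it assumes an
a8r8g8b8 pixel, i.e. alpha in the most significant byte of the 32-bit
value):

    /* The alpha byte of an a8r8g8b8 pixel sits at byte offset 0 in
     * memory on big endian and at offset 3 on little endian, so the
     * big-endian byte index XORed with 0 or 3 selects the right byte
     * on either host. */
    #include <stdio.h>
    #include <stdint.h>
    #include <string.h>

    int
    main (void)
    {
        uint32_t pixel = 0xAA112233;        /* alpha = 0xAA */
        unsigned char bytes[4];
        union { unsigned short s; unsigned char c[2]; } endian_xor = { 0x0300 };
        unsigned idx_xor = endian_xor.c[1]; /* 3 on little endian, 0 on big endian */

        memcpy (bytes, &pixel, sizeof pixel);

        /* big-endian index of the alpha byte is 0; apply the XOR correction */
        printf ("alpha byte: 0x%02X\n", (unsigned) bytes[0 ^ idx_xor]);

        return 0;
    }

The vectorized code in splat_alpha does the same thing for all four
pixels at once, by XORing the constant permute vector with that value.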

Signed-off-by: Fernando Seiti Furusato <ferseiti at linux.vnet.ibm.com>
---
 pixman/pixman-vmx.c | 106 +++++++++++++++++++++++++++++++++-------------------
 1 file changed, 67 insertions(+), 39 deletions(-)

diff --git a/pixman/pixman-vmx.c b/pixman/pixman-vmx.c
index c33631c..57918c1 100644
--- a/pixman/pixman-vmx.c
+++ b/pixman/pixman-vmx.c
@@ -37,46 +37,49 @@
 static force_inline vector unsigned int
 splat_alpha (vector unsigned int pix)
 {
-    return vec_perm (pix, pix,
-		     (vector unsigned char)AVV (
-			 0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
-			 0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+    union {
+	unsigned short s;
+	unsigned char c[2];
+    } endian_xor = {0x0300};
+
+    /* endian_xor.c[1] will be 3 if little endian and 0 if big endian */
+    vector unsigned char perm = vec_splat((vector unsigned char)
+					  AVV (endian_xor.c[1]),0);
+    perm = vec_xor (perm,(vector unsigned char) AVV (
+			  0x00, 0x00, 0x00, 0x00, 0x04, 0x04, 0x04, 0x04,
+			  0x08, 0x08, 0x08, 0x08, 0x0C, 0x0C, 0x0C, 0x0C));
+    return vec_perm (pix, pix, perm);
 }
 
 static force_inline vector unsigned int
 pix_multiply (vector unsigned int p, vector unsigned int a)
 {
-    vector unsigned short hi, lo, mod;
-
-    /* unpack to short */
-    hi = (vector unsigned short)
-	vec_mergeh ((vector unsigned char)AVV (0),
-		    (vector unsigned char)p);
-
-    mod = (vector unsigned short)
-	vec_mergeh ((vector unsigned char)AVV (0),
-		    (vector unsigned char)a);
-
-    hi = vec_mladd (hi, mod, (vector unsigned short)
-                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
-                         0x0080, 0x0080, 0x0080, 0x0080));
+    vector unsigned short hi, lo, even, odd;
+
+    /* unpack to short while multiplying p and a even positions */
+    even = vec_mule((vector unsigned char)p, (vector unsigned char)a);
+    even = vec_add(even, (vector unsigned short)AVV
+			 (0x0080, 0x0080, 0x0080, 0x0080,
+			  0x0080, 0x0080, 0x0080, 0x0080));
+
+    /* unpack to short while multiplying p and a odd positions */
+    odd = vec_mulo ((vector unsigned char)p, (vector unsigned char)a);
+    odd = vec_add (odd, (vector unsigned short)AVV 
+			(0x0080, 0x0080, 0x0080, 0x0080,
+			 0x0080, 0x0080, 0x0080, 0x0080));
+
+    /* change split from even and odd positions to high and low ends */
+    hi = vec_perm (even, odd, (vector unsigned char)AVV
+			     (0x00, 0x01, 0x10, 0x11, 0x02, 0x03, 0x12, 0x13,
+			      0x04, 0x05, 0x14, 0x15, 0x06, 0x07, 0x16, 0x17));
+    lo = vec_perm (even, odd, (vector unsigned char)AVV
+			     (0x08, 0x09, 0x18, 0x19, 0x0A, 0x0B, 0x1A, 0x1B,
+			      0x0C, 0x0D, 0x1C, 0x1D, 0x0E, 0x0F, 0x1E, 0x1F));
 
     hi = vec_adds (hi, vec_sr (hi, vec_splat_u16 (8)));
 
     hi = vec_sr (hi, vec_splat_u16 (8));
 
-    /* unpack to short */
-    lo = (vector unsigned short)
-	vec_mergel ((vector unsigned char)AVV (0),
-		    (vector unsigned char)p);
-    mod = (vector unsigned short)
-	vec_mergel ((vector unsigned char)AVV (0),
-		    (vector unsigned char)a);
-
-    lo = vec_mladd (lo, mod, (vector unsigned short)
-                    AVV (0x0080, 0x0080, 0x0080, 0x0080,
-                         0x0080, 0x0080, 0x0080, 0x0080));
-
     lo = vec_adds (lo, vec_sr (lo, vec_splat_u16 (8)));
 
     lo = vec_sr (lo, vec_splat_u16 (8));
@@ -129,29 +132,26 @@ over (vector unsigned int src,
     over (pix_multiply (src, mask),					\
           pix_multiply (srca, mask), dest)
 
+#ifdef WORDS_BIGENDIAN
 
-#define COMPUTE_SHIFT_MASK(source)					\
+# define COMPUTE_SHIFT_MASK(source)					\
     source ## _mask = vec_lvsl (0, source);
 
-#define COMPUTE_SHIFT_MASKS(dest, source)				\
+# define COMPUTE_SHIFT_MASKS(dest, source)				\
     source ## _mask = vec_lvsl (0, source);
 
-#define COMPUTE_SHIFT_MASKC(dest, source, mask)				\
+# define COMPUTE_SHIFT_MASKC(dest, source, mask)			\
     mask ## _mask = vec_lvsl (0, mask);					\
     source ## _mask = vec_lvsl (0, source);
 
-/* notice you have to declare temp vars...
- * Note: tmp3 and tmp4 must remain untouched!
- */
-
-#define LOAD_VECTORS(dest, source)			  \
+# define LOAD_VECTORS(dest, source)			  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
     v ## source = (typeof(v ## source))			  \
 	vec_perm (tmp1, tmp2, source ## _mask);		  \
     v ## dest = (typeof(v ## dest))vec_ld (0, dest);
 
-#define LOAD_VECTORSC(dest, source, mask)		  \
+# define LOAD_VECTORSC(dest, source, mask)		  \
     tmp1 = (typeof(tmp1))vec_ld (0, source);		  \
     tmp2 = (typeof(tmp2))vec_ld (15, source);		  \
     v ## source = (typeof(v ## source))			  \
@@ -162,6 +162,34 @@ over (vector unsigned int src,
     v ## mask = (typeof(v ## mask))			  \
 	vec_perm (tmp1, tmp2, mask ## _mask);
 
+#else //WORDS_BIGENDIAN
+
+/* Now the COMPUTE_SHIFT_{MASK, MASKS, MASKC} below are just no-op.
+ * They are defined that way because little endian altivec can do unaligned
+ * reads natively and have no need for constructing the permutation pattern
+ * variables.
+ */
+# define COMPUTE_SHIFT_MASK(source)
+
+# define COMPUTE_SHIFT_MASKS(dest, source)
+
+# define COMPUTE_SHIFT_MASKC(dest, source, mask)
+
+# define LOAD_VECTORS(dest, source)                        \
+    v ## source = *((typeof(v ## source)*)source);        \
+    v ## dest = *((typeof(v ## dest)*)dest);
+
+# define LOAD_VECTORSC(dest, source, mask)                 \
+    v ## source = *((typeof(v ## source)*)source);        \
+    v ## dest = *((typeof(v ## dest)*)dest);              \
+    v ## mask = *((typeof(v ## mask)*)mask);
+
+#endif //WORDS_BIGENDIAN
+
+/* notice you have to declare temp vars...
+ * Note: tmp3 and tmp4 must remain untouched!
+ */
+
 #define LOAD_VECTORSM(dest, source, mask)				\
     LOAD_VECTORSC (dest, source, mask)					\
     v ## source = pix_multiply (v ## source,				\
-- 
2.1.4


